better quote handling (#17)

* fixed quote escape bug

* made findClosingQuote more consistent and added unit tests

* fixed bug with findClosingQuote

* minor speedup of findClosingQuote for vanilla lua
This commit is contained in:
Shakil Thakur 2018-05-27 11:26:48 -05:00 committed by GitHub
parent ca292adee0
commit 14b0dded42
2 changed files with 96 additions and 27 deletions

View File

@ -45,7 +45,6 @@ local ssub = string.sub
if type(jit) == 'table' then if type(jit) == 'table' then
-- finds the end of an escape sequence -- finds the end of an escape sequence
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape) function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
-- local doubleQuoteEscape = doubleQuoteEscape
local currentChar, nextChar = sbyte(inputString, i), nil local currentChar, nextChar = sbyte(inputString, i), nil
while i <= inputLength do while i <= inputLength do
-- print(i) -- print(i)
@ -72,24 +71,19 @@ if type(jit) == 'table' then
else else
-- vanilla lua closing quote finder -- vanilla lua closing quote finder
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape) function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
local firstCharIndex = 1 local j, difference
local firstChar, iChar = nil, nil i, j = inputString:find('"+', i)
repeat if j == nil then return end
firstCharIndex, i = inputString:find('".?', i+1)
firstChar = sbyte(inputString, firstCharIndex)
iChar = sbyte(inputString, i)
-- nextChar = string.byte(inputString, i+1)
-- print("HI", offset, i)
-- print(firstChar, iChar)
if firstChar == quote and iChar == quote then
doubleQuoteEscape = true
end
until iChar ~= quote
if i == nil then if i == nil then
return inputLength-1, doubleQuoteEscape return inputLength-1, doubleQuoteEscape
end end
-- print("exiting", i-2) difference = j - i
return i-2, doubleQuoteEscape -- print("difference", difference, "I", i, "J", j)
if difference >= 1 then doubleQuoteEscape = true end
if difference == 1 then
return M.findClosingQuote(j+1, inputLength, inputString, quote, doubleQuoteEscape)
end
return j-1, doubleQuoteEscape
end end
end end
@ -103,10 +97,10 @@ local function loadFile(textFile)
return allLines return allLines
end end
-- creates a new field and adds it to the main table -- creates a new field
local function createField(inputString, quote, fieldStart, i, doubleQuoteEscape) local function createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
local field local field
-- so, if we just recently de-escaped, we don't want the trailing \" -- so, if we just recently de-escaped, we don't want the trailing "
if sbyte(inputString, i-1) == quote then if sbyte(inputString, i-1) == quote then
-- print("Skipping last \"") -- print("Skipping last \"")
field = ssub(inputString, fieldStart, i-2) field = ssub(inputString, fieldStart, i-2)
@ -131,7 +125,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
local fieldStart = i local fieldStart = i
local fieldNum = 1 local fieldNum = 1
local lineNum = 1 local lineNum = 1
local doubleQuoteEscape = false local doubleQuoteEscape, emptyIdentified = false, false
local exit = false local exit = false
--bytes --bytes
@ -149,12 +143,14 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
headerField = {} headerField = {}
assignValue = function() assignValue = function()
headerField[fieldNum] = field headerField[fieldNum] = field
emptyIdentified = false
return true return true
end end
else else
outResults = {} outResults = {}
outResults[1] = {} outResults[1] = {}
assignValue = function() assignValue = function()
emptyIdentified = false
if not pcall(function() if not pcall(function()
outResults[lineNum][headerField[fieldNum]] = field outResults[lineNum][headerField[fieldNum]] = field
end) then end) then
@ -181,9 +177,9 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
-- empty string -- empty string
if currentChar == quote and nextChar == quote then if currentChar == quote and nextChar == quote then
-- print("EMPTY STRING")
skipChar = 1 skipChar = 1
fieldStart = i + 2 fieldStart = i + 2
emptyIdentified = true
-- print("fs+2:", fieldStart) -- print("fs+2:", fieldStart)
-- identifies the escape toggle. -- identifies the escape toggle.
@ -192,6 +188,15 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
elseif currentChar == quote and nextChar ~= quote and fieldStart == i then elseif currentChar == quote and nextChar ~= quote and fieldStart == i then
-- print("New Quoted Field", i) -- print("New Quoted Field", i)
fieldStart = i + 1 fieldStart = i + 1
-- if an empty field was identified before assignment, it means
-- that this is a quoted field that starts with escaped quotes
-- ex: """a"""
if emptyIdentified then
fieldStart = fieldStart - 2
emptyIdentified = false
end
i, doubleQuoteEscape = M.findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape) i, doubleQuoteEscape = M.findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape)
-- print("I VALUE", i, doubleQuoteEscape) -- print("I VALUE", i, doubleQuoteEscape)
skipChar = 1 skipChar = 1
@ -211,7 +216,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
fieldNum = fieldNum + 1 fieldNum = fieldNum + 1
fieldStart = i + 1 fieldStart = i + 1
-- print("fs+1:", fieldStart) -- print("fs+1:", fieldStart)
-- end
-- newline?! -- newline?!
elseif (currentChar == CR or currentChar == LF) then elseif (currentChar == CR or currentChar == LF) then
@ -219,7 +223,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
-- create the new field -- create the new field
field = createField(inputString, quote, fieldStart, i, doubleQuoteEscape) field = createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
-- outResults[headerField[fieldNum]][lineNum] = field
exit = assignValue() exit = assignValue()
if exit then if exit then
if (currentChar == CR and nextChar == LF) then if (currentChar == CR and nextChar == LF) then
@ -235,8 +238,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
if (currentChar == CR and nextChar == LF) then if (currentChar == CR and nextChar == LF) then
-- print("CRLF DETECTED") -- print("CRLF DETECTED")
skipChar = 1 skipChar = 1
fieldStart = fieldStart + 1
-- print("fs:", fieldStart)
end end
-- increment for new line -- increment for new line
@ -269,7 +270,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
-- if there's no newline, the parser doesn't return headers correctly... -- if there's no newline, the parser doesn't return headers correctly...
-- ex: a,b,c -- ex: a,b,c
if outResults == nil then if outResults == nil then
return headerField, i return headerField, i-1
end end
-- clean up last line if it's weird (this happens when there is a CRLF newline at end of file) -- clean up last line if it's weird (this happens when there is a CRLF newline at end of file)
@ -355,7 +356,9 @@ function ftcsv.parse(inputFile, delimiter, options)
local startLine = 1 local startLine = 1
-- check for BOM -- check for BOM
if string.byte(inputString, 1) == 239 and string.byte(inputString, 2) == 187 and string.byte(inputString, 3) == 191 then if string.byte(inputString, 1) == 239
and string.byte(inputString, 2) == 187
and string.byte(inputString, 3) == 191 then
startLine = 4 startLine = 4
end end
local headerField, i = parseString(inputString, inputLength, delimiter, startLine) local headerField, i = parseString(inputString, inputLength, delimiter, startLine)
@ -370,7 +373,7 @@ function ftcsv.parse(inputFile, delimiter, options)
-- for files where there aren't headers! -- for files where there aren't headers!
if header == false then if header == false then
i = 0 i = 1
for j = 1, #headerField do for j = 1, #headerField do
headerField[j] = j headerField[j] = j
end end

View File

@ -21,6 +21,46 @@ describe("csv features", function()
assert.are.same(expected, actual) assert.are.same(expected, actual)
end) end)
it("should handle cr loading from string", function()
    -- The header row and the single data row are separated by a bare CR.
    local want = {
        { a = "apple", b = "banana", c = "carrot" },
    }
    local got = ftcsv.parse("a,b,c\rapple,banana,carrot", ",", {loadFromString=true})
    assert.are.same(want, got)
end)
it("should handle quotes loading from string", function()
    -- Every header and value is wrapped in quotes; the parsed keys and
    -- values are expected without the surrounding quotes.
    local want = {
        { a = "apple", b = "banana", c = "carrot" },
    }
    local got = ftcsv.parse('"a","b","c"\n"apple","banana","carrot"', ",", {loadFromString=true})
    assert.are.same(want, got)
end)
it("should handle doublequotes loading from string", function()
    -- Each value starts and ends with a doubled quote inside its quoted
    -- field; the expected result keeps one literal quote on each side.
    local want = {
        { a = '"apple"', b = '"banana"', c = '"carrot"' },
    }
    local got = ftcsv.parse('"a","b","c"\n"""apple""","""banana""","""carrot"""', ",", {loadFromString=true})
    assert.are.same(want, got)
end)
-- The description is unique within the spec so a failure report identifies
-- this case unambiguously.
it("should handle mixed doublequoted and quoted fields loading from string", function()
    -- A plainly quoted field sits between two fields that begin and end
    -- with escaped (doubled) quotes; only the outer two are expected to
    -- keep a literal quote on each side.
    local expected = {
        { a = '"apple"', b = 'banana', c = '"carrot"' },
    }
    local actual = ftcsv.parse('"a","b","c"\n"""apple""","banana","""carrot"""', ",", {loadFromString=true})
    assert.are.same(expected, actual)
end)
it("should handle renaming a field", function() it("should handle renaming a field", function()
local expected = {} local expected = {}
expected[1] = {} expected[1] = {}
@ -125,6 +165,32 @@ describe("csv features", function()
assert.are.same(expected, actual) assert.are.same(expected, actual)
end) end)
it("should handle files with quotes and without (headers and newlines)", function()
    -- With headers=false the single quoted row is expected to be keyed by
    -- column position; the input has no line terminator at all.
    local want = {
        { "apple", "banana", "carrot" },
    }
    local got = ftcsv.parse('"apple">"banana">"carrot"', ">", {loadFromString=true, headers=false})
    assert.are.same(want, got)
end)
-- The description states what the input actually contains: quoted fields,
-- no header row, and two rows separated by a newline.
it("should handle files with quotes and without headers (multiple lines)", function()
    -- With headers=false every row, including the first, is expected as
    -- data keyed by column position.
    local expected = {
        { "apple", "banana", "carrot" },
        { "diamond", "emerald", "pearl" },
    }
    local options = {loadFromString=true, headers=false}
    local actual = ftcsv.parse('"apple">"banana">"carrot"\n"diamond">"emerald">"pearl"', ">", options)
    assert.are.same(expected, actual)
end)
it("should handle files without (headers and newlines) w/newline at end", function() it("should handle files without (headers and newlines) w/newline at end", function()
local expected = {} local expected = {}
expected[1] = {} expected[1] = {}