From 14b0dded4213b1ab9e6d7456c46cfc662a45fb16 Mon Sep 17 00:00:00 2001 From: Shakil Thakur Date: Sun, 27 May 2018 11:26:48 -0500 Subject: [PATCH] better quote handling (#17) * fixed quote escape bug * made findClosingQuote more consistent and added unit tests * fixed bug with findClosingQuote * minor speedup of findClosingQuote for vanilla lua --- ftcsv.lua | 57 +++++++++++++++++++------------------ spec/feature_spec.lua | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 27 deletions(-) diff --git a/ftcsv.lua b/ftcsv.lua index 1e9d579..decb289 100644 --- a/ftcsv.lua +++ b/ftcsv.lua @@ -45,7 +45,6 @@ local ssub = string.sub if type(jit) == 'table' then -- finds the end of an escape sequence function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape) - -- local doubleQuoteEscape = doubleQuoteEscape local currentChar, nextChar = sbyte(inputString, i), nil while i <= inputLength do -- print(i) @@ -72,24 +71,19 @@ if type(jit) == 'table' then else -- vanilla lua closing quote finder function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape) - local firstCharIndex = 1 - local firstChar, iChar = nil, nil - repeat - firstCharIndex, i = inputString:find('".?', i+1) - firstChar = sbyte(inputString, firstCharIndex) - iChar = sbyte(inputString, i) - -- nextChar = string.byte(inputString, i+1) - -- print("HI", offset, i) - -- print(firstChar, iChar) - if firstChar == quote and iChar == quote then - doubleQuoteEscape = true - end - until iChar ~= quote + local j, difference + i, j = inputString:find('"+', i) + if j == nil then return end if i == nil then return inputLength-1, doubleQuoteEscape end - -- print("exiting", i-2) - return i-2, doubleQuoteEscape + difference = j - i + -- print("difference", difference, "I", i, "J", j) + if difference >= 1 then doubleQuoteEscape = true end + if difference == 1 then + return M.findClosingQuote(j+1, inputLength, inputString, quote, doubleQuoteEscape) + end + return j-1, doubleQuoteEscape end end @@ -103,10 +97,10 @@ local function loadFile(textFile) return allLines end --- creates a new field and adds it to the main table +-- creates a new field local function createField(inputString, quote, fieldStart, i, doubleQuoteEscape) local field - -- so, if we just recently de-escaped, we don't want the trailing \" + -- so, if we just recently de-escaped, we don't want the trailing " if sbyte(inputString, i-1) == quote then -- print("Skipping last \"") field = ssub(inputString, fieldStart, i-2) @@ -131,7 +125,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, local fieldStart = i local fieldNum = 1 local lineNum = 1 - local doubleQuoteEscape = false + local doubleQuoteEscape, emptyIdentified = false, false local exit = false --bytes @@ -149,12 +143,14 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, headerField = {} assignValue = function() headerField[fieldNum] = field + emptyIdentified = false return true end else outResults = {} outResults[1] = {} assignValue = function() + emptyIdentified = false if not pcall(function() outResults[lineNum][headerField[fieldNum]] = field end) then @@ -181,9 +177,9 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, -- empty string if currentChar == quote and nextChar == quote then - -- print("EMPTY STRING") skipChar = 1 fieldStart = i + 2 + emptyIdentified = true -- print("fs+2:", fieldStart) -- identifies the escape toggle. @@ -192,6 +188,15 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, elseif currentChar == quote and nextChar ~= quote and fieldStart == i then -- print("New Quoted Field", i) fieldStart = i + 1 + + -- if an empty field was identified before assignment, it means + -- that this is a quoted field that starts with escaped quotes + -- ex: """a""" + if emptyIdentified then + fieldStart = fieldStart - 2 + emptyIdentified = false + end + i, doubleQuoteEscape = M.findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape) -- print("I VALUE", i, doubleQuoteEscape) skipChar = 1 @@ -211,7 +216,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, fieldNum = fieldNum + 1 fieldStart = i + 1 -- print("fs+1:", fieldStart) - -- end -- newline?! elseif (currentChar == CR or currentChar == LF) then @@ -219,7 +223,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, -- create the new field field = createField(inputString, quote, fieldStart, i, doubleQuoteEscape) - -- outResults[headerField[fieldNum]][lineNum] = field exit = assignValue() if exit then if (currentChar == CR and nextChar == LF) then @@ -235,8 +238,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, if (currentChar == CR and nextChar == LF) then -- print("CRLF DETECTED") skipChar = 1 - fieldStart = fieldStart + 1 - -- print("fs:", fieldStart) end -- incrememnt for new line @@ -269,7 +270,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField, -- if there's no newline, the parser doesn't return headers correctly... -- ex: a,b,c if outResults == nil then - return headerField, i + return headerField, i-1 end -- clean up last line if it's weird (this happens when there is a CRLF newline at end of file) @@ -355,7 +356,9 @@ function ftcsv.parse(inputFile, delimiter, options) local startLine = 1 -- check for BOM - if string.byte(inputString, 1) == 239 and string.byte(inputString, 2) == 187 and string.byte(inputString, 3) == 191 then + if string.byte(inputString, 1) == 239 + and string.byte(inputString, 2) == 187 + and string.byte(inputString, 3) == 191 then startLine = 4 end local headerField, i = parseString(inputString, inputLength, delimiter, startLine) @@ -370,7 +373,7 @@ function ftcsv.parse(inputFile, delimiter, options) -- for files where there aren't headers! if header == false then - i = 0 + i = 1 for j = 1, #headerField do headerField[j] = j end diff --git a/spec/feature_spec.lua b/spec/feature_spec.lua index 70c1bc4..62355d4 100644 --- a/spec/feature_spec.lua +++ b/spec/feature_spec.lua @@ -21,6 +21,46 @@ describe("csv features", function() assert.are.same(expected, actual) end) + it("should handle cr loading from string", function() + local expected = {} + expected[1] = {} + expected[1].a = "apple" + expected[1].b = "banana" + expected[1].c = "carrot" + local actual = ftcsv.parse("a,b,c\rapple,banana,carrot", ",", {loadFromString=true}) + assert.are.same(expected, actual) + end) + + it("should handle quotes loading from string", function() + local expected = {} + expected[1] = {} + expected[1].a = "apple" + expected[1].b = "banana" + expected[1].c = "carrot" + local actual = ftcsv.parse('"a","b","c"\n"apple","banana","carrot"', ",", {loadFromString=true}) + assert.are.same(expected, actual) + end) + + it("should handle doublequotes loading from string", function() + local expected = {} + expected[1] = {} + expected[1].a = '"apple"' + expected[1].b = '"banana"' + expected[1].c = '"carrot"' + local actual = ftcsv.parse('"a","b","c"\n"""apple""","""banana""","""carrot"""', ",", {loadFromString=true}) + assert.are.same(expected, actual) + end) + + it("should handle doublequotes loading from string", function() + local expected = {} + expected[1] = {} + expected[1].a = '"apple"' + expected[1].b = 'banana' + expected[1].c = '"carrot"' + local actual = ftcsv.parse('"a","b","c"\n"""apple""","banana","""carrot"""', ",", {loadFromString=true}) + assert.are.same(expected, actual) + end) + it("should handle renaming a field", function() local expected = {} expected[1] = {} @@ -125,6 +165,32 @@ describe("csv features", function() assert.are.same(expected, actual) end) + it("should handle files with quotes and without (headers and newlines)", function() + local expected = {} + expected[1] = {} + expected[1][1] = "apple" + expected[1][2] = "banana" + expected[1][3] = "carrot" + local options = {loadFromString=true, headers=false} + local actual = ftcsv.parse('"apple">"banana">"carrot"', ">", options) + assert.are.same(expected, actual) + end) + + it("should handle files with quotes and without (headers and newlines)", function() + local expected = {} + expected[1] = {} + expected[1][1] = "apple" + expected[1][2] = "banana" + expected[1][3] = "carrot" + expected[2] = {} + expected[2][1] = "diamond" + expected[2][2] = "emerald" + expected[2][3] = "pearl" + local options = {loadFromString=true, headers=false} + local actual = ftcsv.parse('"apple">"banana">"carrot"\n"diamond">"emerald">"pearl"', ">", options) + assert.are.same(expected, actual) + end) + it("should handle files without (headers and newlines) w/newline at end", function() local expected = {} expected[1] = {}