mirror of
https://github.com/FourierTransformer/ftcsv.git
synced 2024-11-19 19:54:23 +00:00
better quote handling (#17)
* fixed quote escape bug
* made findClosingQuote more consistent and added unit tests
* fixed bug with findClosingQuote
* minor speedup of findClosingQuote for vanilla lua
This commit is contained in:
parent
ca292adee0
commit
14b0dded42
57
ftcsv.lua
57
ftcsv.lua
@ -45,7 +45,6 @@ local ssub = string.sub
|
|||||||
if type(jit) == 'table' then
|
if type(jit) == 'table' then
|
||||||
-- finds the end of an escape sequence
|
-- finds the end of an escape sequence
|
||||||
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
|
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
|
||||||
-- local doubleQuoteEscape = doubleQuoteEscape
|
|
||||||
local currentChar, nextChar = sbyte(inputString, i), nil
|
local currentChar, nextChar = sbyte(inputString, i), nil
|
||||||
while i <= inputLength do
|
while i <= inputLength do
|
||||||
-- print(i)
|
-- print(i)
|
||||||
@ -72,24 +71,19 @@ if type(jit) == 'table' then
|
|||||||
else
|
else
|
||||||
-- vanilla lua closing quote finder
|
-- vanilla lua closing quote finder
|
||||||
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
|
function M.findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
|
||||||
local firstCharIndex = 1
|
local j, difference
|
||||||
local firstChar, iChar = nil, nil
|
i, j = inputString:find('"+', i)
|
||||||
repeat
|
if j == nil then return end
|
||||||
firstCharIndex, i = inputString:find('".?', i+1)
|
|
||||||
firstChar = sbyte(inputString, firstCharIndex)
|
|
||||||
iChar = sbyte(inputString, i)
|
|
||||||
-- nextChar = string.byte(inputString, i+1)
|
|
||||||
-- print("HI", offset, i)
|
|
||||||
-- print(firstChar, iChar)
|
|
||||||
if firstChar == quote and iChar == quote then
|
|
||||||
doubleQuoteEscape = true
|
|
||||||
end
|
|
||||||
until iChar ~= quote
|
|
||||||
if i == nil then
|
if i == nil then
|
||||||
return inputLength-1, doubleQuoteEscape
|
return inputLength-1, doubleQuoteEscape
|
||||||
end
|
end
|
||||||
-- print("exiting", i-2)
|
difference = j - i
|
||||||
return i-2, doubleQuoteEscape
|
-- print("difference", difference, "I", i, "J", j)
|
||||||
|
if difference >= 1 then doubleQuoteEscape = true end
|
||||||
|
if difference == 1 then
|
||||||
|
return M.findClosingQuote(j+1, inputLength, inputString, quote, doubleQuoteEscape)
|
||||||
|
end
|
||||||
|
return j-1, doubleQuoteEscape
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
@ -103,10 +97,10 @@ local function loadFile(textFile)
|
|||||||
return allLines
|
return allLines
|
||||||
end
|
end
|
||||||
|
|
||||||
-- creates a new field and adds it to the main table
|
-- creates a new field
|
||||||
local function createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
|
local function createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
|
||||||
local field
|
local field
|
||||||
-- so, if we just recently de-escaped, we don't want the trailing \"
|
-- so, if we just recently de-escaped, we don't want the trailing "
|
||||||
if sbyte(inputString, i-1) == quote then
|
if sbyte(inputString, i-1) == quote then
|
||||||
-- print("Skipping last \"")
|
-- print("Skipping last \"")
|
||||||
field = ssub(inputString, fieldStart, i-2)
|
field = ssub(inputString, fieldStart, i-2)
|
||||||
@ -131,7 +125,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
local fieldStart = i
|
local fieldStart = i
|
||||||
local fieldNum = 1
|
local fieldNum = 1
|
||||||
local lineNum = 1
|
local lineNum = 1
|
||||||
local doubleQuoteEscape = false
|
local doubleQuoteEscape, emptyIdentified = false, false
|
||||||
local exit = false
|
local exit = false
|
||||||
|
|
||||||
--bytes
|
--bytes
|
||||||
@ -149,12 +143,14 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
headerField = {}
|
headerField = {}
|
||||||
assignValue = function()
|
assignValue = function()
|
||||||
headerField[fieldNum] = field
|
headerField[fieldNum] = field
|
||||||
|
emptyIdentified = false
|
||||||
return true
|
return true
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
outResults = {}
|
outResults = {}
|
||||||
outResults[1] = {}
|
outResults[1] = {}
|
||||||
assignValue = function()
|
assignValue = function()
|
||||||
|
emptyIdentified = false
|
||||||
if not pcall(function()
|
if not pcall(function()
|
||||||
outResults[lineNum][headerField[fieldNum]] = field
|
outResults[lineNum][headerField[fieldNum]] = field
|
||||||
end) then
|
end) then
|
||||||
@ -181,9 +177,9 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
|
|
||||||
-- empty string
|
-- empty string
|
||||||
if currentChar == quote and nextChar == quote then
|
if currentChar == quote and nextChar == quote then
|
||||||
-- print("EMPTY STRING")
|
|
||||||
skipChar = 1
|
skipChar = 1
|
||||||
fieldStart = i + 2
|
fieldStart = i + 2
|
||||||
|
emptyIdentified = true
|
||||||
-- print("fs+2:", fieldStart)
|
-- print("fs+2:", fieldStart)
|
||||||
|
|
||||||
-- identifies the escape toggle.
|
-- identifies the escape toggle.
|
||||||
@ -192,6 +188,15 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
elseif currentChar == quote and nextChar ~= quote and fieldStart == i then
|
elseif currentChar == quote and nextChar ~= quote and fieldStart == i then
|
||||||
-- print("New Quoted Field", i)
|
-- print("New Quoted Field", i)
|
||||||
fieldStart = i + 1
|
fieldStart = i + 1
|
||||||
|
|
||||||
|
-- if an empty field was identified before assignment, it means
|
||||||
|
-- that this is a quoted field that starts with escaped quotes
|
||||||
|
-- ex: """a"""
|
||||||
|
if emptyIdentified then
|
||||||
|
fieldStart = fieldStart - 2
|
||||||
|
emptyIdentified = false
|
||||||
|
end
|
||||||
|
|
||||||
i, doubleQuoteEscape = M.findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape)
|
i, doubleQuoteEscape = M.findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape)
|
||||||
-- print("I VALUE", i, doubleQuoteEscape)
|
-- print("I VALUE", i, doubleQuoteEscape)
|
||||||
skipChar = 1
|
skipChar = 1
|
||||||
@ -211,7 +216,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
fieldNum = fieldNum + 1
|
fieldNum = fieldNum + 1
|
||||||
fieldStart = i + 1
|
fieldStart = i + 1
|
||||||
-- print("fs+1:", fieldStart)
|
-- print("fs+1:", fieldStart)
|
||||||
-- end
|
|
||||||
|
|
||||||
-- newline?!
|
-- newline?!
|
||||||
elseif (currentChar == CR or currentChar == LF) then
|
elseif (currentChar == CR or currentChar == LF) then
|
||||||
@ -219,7 +223,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
-- create the new field
|
-- create the new field
|
||||||
field = createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
|
field = createField(inputString, quote, fieldStart, i, doubleQuoteEscape)
|
||||||
|
|
||||||
-- outResults[headerField[fieldNum]][lineNum] = field
|
|
||||||
exit = assignValue()
|
exit = assignValue()
|
||||||
if exit then
|
if exit then
|
||||||
if (currentChar == CR and nextChar == LF) then
|
if (currentChar == CR and nextChar == LF) then
|
||||||
@ -235,8 +238,6 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
if (currentChar == CR and nextChar == LF) then
|
if (currentChar == CR and nextChar == LF) then
|
||||||
-- print("CRLF DETECTED")
|
-- print("CRLF DETECTED")
|
||||||
skipChar = 1
|
skipChar = 1
|
||||||
fieldStart = fieldStart + 1
|
|
||||||
-- print("fs:", fieldStart)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
-- increment for new line
|
-- increment for new line
|
||||||
@ -269,7 +270,7 @@ local function parseString(inputString, inputLength, delimiter, i, headerField,
|
|||||||
-- if there's no newline, the parser doesn't return headers correctly...
|
-- if there's no newline, the parser doesn't return headers correctly...
|
||||||
-- ex: a,b,c
|
-- ex: a,b,c
|
||||||
if outResults == nil then
|
if outResults == nil then
|
||||||
return headerField, i
|
return headerField, i-1
|
||||||
end
|
end
|
||||||
|
|
||||||
-- clean up last line if it's weird (this happens when there is a CRLF newline at end of file)
|
-- clean up last line if it's weird (this happens when there is a CRLF newline at end of file)
|
||||||
@ -355,7 +356,9 @@ function ftcsv.parse(inputFile, delimiter, options)
|
|||||||
local startLine = 1
|
local startLine = 1
|
||||||
|
|
||||||
-- check for BOM
|
-- check for BOM
|
||||||
if string.byte(inputString, 1) == 239 and string.byte(inputString, 2) == 187 and string.byte(inputString, 3) == 191 then
|
if string.byte(inputString, 1) == 239
|
||||||
|
and string.byte(inputString, 2) == 187
|
||||||
|
and string.byte(inputString, 3) == 191 then
|
||||||
startLine = 4
|
startLine = 4
|
||||||
end
|
end
|
||||||
local headerField, i = parseString(inputString, inputLength, delimiter, startLine)
|
local headerField, i = parseString(inputString, inputLength, delimiter, startLine)
|
||||||
@ -370,7 +373,7 @@ function ftcsv.parse(inputFile, delimiter, options)
|
|||||||
|
|
||||||
-- for files where there aren't headers!
|
-- for files where there aren't headers!
|
||||||
if header == false then
|
if header == false then
|
||||||
i = 0
|
i = 1
|
||||||
for j = 1, #headerField do
|
for j = 1, #headerField do
|
||||||
headerField[j] = j
|
headerField[j] = j
|
||||||
end
|
end
|
||||||
|
@ -21,6 +21,46 @@ describe("csv features", function()
|
|||||||
assert.are.same(expected, actual)
|
assert.are.same(expected, actual)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it("should handle cr loading from string", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1].a = "apple"
|
||||||
|
expected[1].b = "banana"
|
||||||
|
expected[1].c = "carrot"
|
||||||
|
local actual = ftcsv.parse("a,b,c\rapple,banana,carrot", ",", {loadFromString=true})
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it("should handle quotes loading from string", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1].a = "apple"
|
||||||
|
expected[1].b = "banana"
|
||||||
|
expected[1].c = "carrot"
|
||||||
|
local actual = ftcsv.parse('"a","b","c"\n"apple","banana","carrot"', ",", {loadFromString=true})
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it("should handle doublequotes loading from string", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1].a = '"apple"'
|
||||||
|
expected[1].b = '"banana"'
|
||||||
|
expected[1].c = '"carrot"'
|
||||||
|
local actual = ftcsv.parse('"a","b","c"\n"""apple""","""banana""","""carrot"""', ",", {loadFromString=true})
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it("should handle doublequotes loading from string", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1].a = '"apple"'
|
||||||
|
expected[1].b = 'banana'
|
||||||
|
expected[1].c = '"carrot"'
|
||||||
|
local actual = ftcsv.parse('"a","b","c"\n"""apple""","banana","""carrot"""', ",", {loadFromString=true})
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
it("should handle renaming a field", function()
|
it("should handle renaming a field", function()
|
||||||
local expected = {}
|
local expected = {}
|
||||||
expected[1] = {}
|
expected[1] = {}
|
||||||
@ -125,6 +165,32 @@ describe("csv features", function()
|
|||||||
assert.are.same(expected, actual)
|
assert.are.same(expected, actual)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it("should handle files with quotes and without (headers and newlines)", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1][1] = "apple"
|
||||||
|
expected[1][2] = "banana"
|
||||||
|
expected[1][3] = "carrot"
|
||||||
|
local options = {loadFromString=true, headers=false}
|
||||||
|
local actual = ftcsv.parse('"apple">"banana">"carrot"', ">", options)
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it("should handle files with quotes and without (headers and newlines)", function()
|
||||||
|
local expected = {}
|
||||||
|
expected[1] = {}
|
||||||
|
expected[1][1] = "apple"
|
||||||
|
expected[1][2] = "banana"
|
||||||
|
expected[1][3] = "carrot"
|
||||||
|
expected[2] = {}
|
||||||
|
expected[2][1] = "diamond"
|
||||||
|
expected[2][2] = "emerald"
|
||||||
|
expected[2][3] = "pearl"
|
||||||
|
local options = {loadFromString=true, headers=false}
|
||||||
|
local actual = ftcsv.parse('"apple">"banana">"carrot"\n"diamond">"emerald">"pearl"', ">", options)
|
||||||
|
assert.are.same(expected, actual)
|
||||||
|
end)
|
||||||
|
|
||||||
it("should handle files without (headers and newlines) w/newline at end", function()
|
it("should handle files without (headers and newlines) w/newline at end", function()
|
||||||
local expected = {}
|
local expected = {}
|
||||||
expected[1] = {}
|
expected[1] = {}
|
||||||
|
Loading…
Reference in New Issue
Block a user