mirror of
https://github.com/FourierTransformer/ftcsv.git
synced 2024-11-19 19:54:23 +00:00
added more unit tests and finalized api
This commit is contained in:
parent
66e8c1306e
commit
d24a00c290
195
ftcsv.lua
195
ftcsv.lua
@ -1,5 +1,28 @@
|
||||
---------------
|
||||
-- ## ftcsv, a fairly fast csv library written in pure lua
|
||||
--
|
||||
-- It works well for CSVs that can easily be fully loaded into memory (easily
|
||||
-- up to a hundred MBs). Currently, there isn't a "large" file mode with
|
||||
-- proper readers and writers for ingesting CSVs in bulk with a fixed amount
|
||||
-- of memory
|
||||
--
|
||||
-- @author Shakil Thakur
|
||||
-- @copyright 2016
|
||||
-- @license MIT
|
||||
---------------
|
||||
|
||||
local ftcsv = {}
|
||||
|
||||
-- load an entire file into memory
|
||||
-- Reads the whole file at `textFile` into memory and returns it as a string.
-- Raises an error containing "File not found at <path>" when the file
-- cannot be opened.
local function loadFile(textFile)
    local file = io.open(textFile, "r")
    if not file then error("File not found at " .. textFile) end
    local allLines = file:read("*all")
    file:close()
    return allLines
end
|
||||
|
||||
-- finds the end of an escape sequence
|
||||
local function findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
|
||||
local doubleQuoteEscape = doubleQuoteEscape
|
||||
while i <= inputLength do
|
||||
@ -20,19 +43,23 @@ local function findClosingQuote(i, inputLength, inputString, quote, doubleQuoteE
|
||||
end
|
||||
end
|
||||
|
||||
-- creates a new field and adds it to the main table
|
||||
local function createNewField(inputString, quote, fieldStart, i, line, fieldNum, doubleQuoteEscape, fieldsToKeep)
|
||||
-- print(lineNum, fieldNum, fieldStart, i-1)
|
||||
-- so, if we just recently de-escaped, we don't want the trailing \"
|
||||
-- if fieldsToKeep == nil then
|
||||
-- local fieldsToKeep = fieldsToKeep
|
||||
local output = line[fieldNum]
|
||||
if fieldsToKeep == nil or fieldsToKeep[fieldNum] then
|
||||
-- print(fieldsToKeep)
|
||||
-- print("b4", i, fieldNum, line[fieldNum])
|
||||
if string.byte(inputString, i-1) == quote then
|
||||
-- print("Skipping last \"")
|
||||
line[fieldNum] = string.sub(inputString, fieldStart, i-2)
|
||||
else
|
||||
line[fieldNum] = string.sub(inputString, fieldStart, i-1)
|
||||
end
|
||||
-- print("aft", i, fieldNum, line[fieldNum])
|
||||
-- remove the double quotes (if they existed)
|
||||
if doubleQuoteEscape then
|
||||
-- print("QUOTE REPLACE")
|
||||
@ -43,11 +70,13 @@ local function createNewField(inputString, quote, fieldStart, i, line, fieldNum,
|
||||
end
|
||||
end
|
||||
|
||||
-- creates the headers after reading through to the first line
|
||||
local function createHeaders(line, rename, fieldsToKeep)
|
||||
-- print("CREATING HEADERS")
|
||||
local headers = {}
|
||||
for i = 1, #line do
|
||||
if rename[line[i]] then
|
||||
-- print("RENAMING", line[i], rename[line[i]])
|
||||
headers[i] = rename[line[i]]
|
||||
else
|
||||
headers[i] = line[i]
|
||||
@ -61,39 +90,46 @@ local function createHeaders(line, rename, fieldsToKeep)
|
||||
return headers, 0, true, fieldsToKeep
|
||||
end
|
||||
|
||||
function ftcsv.decode(inputString, separator, options)
|
||||
-- main function used to parse
|
||||
function ftcsv.parse(inputFile, delimiter, options)
|
||||
-- each line in outResults holds another table
|
||||
local outResults = {}
|
||||
outResults[1] = {}
|
||||
|
||||
-- separator MUST be one character
|
||||
if #separator ~= 1 and type("separator") ~= "string" then error("the separator must be of string type and exactly one character") end
|
||||
local separator = string.byte(separator)
|
||||
-- delimiter MUST be one character
|
||||
assert(#delimiter == 1 and type(delimiter) == "string", "the delimiter must be of string type and exactly one character")
|
||||
local delimiter = string.byte(delimiter)
|
||||
|
||||
-- OPTIONS yo
|
||||
local header = true
|
||||
local rename = {}
|
||||
local fieldsToKeep = nil
|
||||
local ofieldsToKeep = nil
|
||||
local loadFromString = false
|
||||
if options then
|
||||
if options.headers ~= nil then
|
||||
if type(options.headers) ~= "boolean" then
|
||||
error("ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. options.headers .. "' of type '" .. type(options.headers) .. "'.")
|
||||
end
|
||||
assert(type(options.headers) == "boolean", "ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.")
|
||||
header = options.headers
|
||||
end
|
||||
if options.rename ~= nil then
|
||||
if type(options.rename) ~= "table" then
|
||||
error("ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. options.rename .. "' of type '" .. type(options.rename) .. "'.")
|
||||
end
|
||||
assert(type(options.rename) == "table", "ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.")
|
||||
rename = options.rename
|
||||
end
|
||||
if options.fieldsToKeep ~= nil then
|
||||
assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.")
|
||||
ofieldsToKeep = options.fieldsToKeep
|
||||
if type(options.fieldsToKeep) ~= "table" then
|
||||
error("ftcsv only takes in a list (as a table for the optional parameter 'fieldsToKeep'. You passed in '" .. options.fieldsToKeep .. "' of type '" .. type(options.fieldsToKeep) .. "'.")
|
||||
end
|
||||
end
|
||||
if options.loadFromString ~= nil then
|
||||
assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. "'.")
|
||||
loadFromString = options.loadFromString
|
||||
end
|
||||
end
|
||||
|
||||
local inputString
|
||||
if loadFromString then
|
||||
inputString = inputFile
|
||||
else
|
||||
inputString = loadFile(inputFile)
|
||||
end
|
||||
|
||||
local CR = string.byte("\r")
|
||||
@ -138,7 +174,7 @@ function ftcsv.decode(inputString, separator, options)
|
||||
-- end
|
||||
|
||||
-- create some fields if we can!
|
||||
elseif currentChar == separator then
|
||||
elseif currentChar == delimiter then
|
||||
-- for that first field
|
||||
if not headerSet and lineNum == 1 then
|
||||
headerField[fieldNum] = fieldNum
|
||||
@ -196,75 +232,114 @@ function ftcsv.decode(inputString, separator, options)
|
||||
|
||||
-- clean up last line if it's weird (this happens when there is a CRLF newline at end of file)
|
||||
-- doing a count gets it to pick up the oddballs
|
||||
local count = 0
|
||||
for _, _ in pairs(outResults[lineNum]) do
|
||||
count = count + 1
|
||||
local finalLineCount = 0
|
||||
for _, value in pairs(outResults[lineNum]) do
|
||||
finalLineCount = finalLineCount + 1
|
||||
end
|
||||
if count ~= #headerField then
|
||||
local initialLineCount = 0
|
||||
for _, value in pairs(outResults[1]) do
|
||||
initialLineCount = initialLineCount + 1
|
||||
end
|
||||
-- print("Final/Initial", finalLineCount, initialLineCount)
|
||||
if finalLineCount ~= initialLineCount then
|
||||
outResults[lineNum] = nil
|
||||
end
|
||||
|
||||
return outResults
|
||||
end
|
||||
|
||||
-- a function that delimits " to "", used by the writer
|
||||
-- Escapes a value for use inside a double-quoted CSV field by doubling every
-- embedded double quote. The surrounding quotes are NOT added here; the
-- caller wraps the result. Always returns exactly one string.
local function delimitField(field)
    -- non-string cells (numbers, booleans) are written via their tostring form
    field = tostring(field)
    if field:find('"', 1, true) then
        -- the parentheses drop gsub's second result (the replacement count)
        return (field:gsub('"', '""'))
    end
    return field
end
|
||||
|
||||
function ftcsv.encode(inputTable, separator, headers)
|
||||
-- separator MUST be one character
|
||||
if #separator ~= 1 and type("separator") ~= "string" then error("the separator must be of string type and exactly one character") end
|
||||
-- a function that compiles some lua code to quickly print out the csv
|
||||
-- Builds a generic-for iterator that yields `index, line` for every row of
-- inputTable, where `line` is the row rendered as fully quoted CSV fields
-- terminated by "\r\n".
--
-- The row formatter is generated as Lua source and compiled once, so each row
-- is assembled by a single concatenation expression with constant keys.
--
-- @param inputTable list of rows; each row maps a header name to its value
-- @param dilimeter  the field delimiter placed between the quoted fields
-- @param headers    list of string keys to emit, in output order (not modified)
-- @return the compiled iterator function, its state table, and the start index 0
local function writer(inputTable, dilimeter, headers)
    local firstRow = inputTable[1]
    local fieldExprs = {}
    for i = 1, #headers do
        local header = headers[i]
        if type(header) ~= "string" then
            error("the field '" .. tostring(header) .. "' must be a string")
        end
        -- an empty inputTable has no first row to validate against
        if firstRow ~= nil and firstRow[header] == nil then
            error("the field '" .. header .. "' doesn't exist in the table")
        end
        -- %q produces a literal the Lua reader accepts (quotes, backslashes
        -- and newlines escaped), so the caller's headers stay untouched
        fieldExprs[i] = string.format("d(row[%q])", header)
    end
    if #fieldExprs == 0 then
        -- keep the generated expression valid when there is nothing to emit
        fieldExprs[1] = '""'
    end

    -- the delimiter travels through `state` instead of being spliced into the
    -- source, so characters such as ' or \ cannot break the generated code
    local source = table.concat({
        "local state, i = ...",
        "local d, sep = state.delimitField, state.separator",
        "i = i + 1",
        "if i > state.tableSize then return nil end",
        "local row = state.t[i]",
        "return i, '\"' .. " .. table.concat(fieldExprs, " .. sep .. ") .. " .. '\"\\r\\n'",
    }, "\n")

    local state = {
        t = inputTable,
        tableSize = #inputTable,
        delimitField = delimitField,
        separator = '"' .. dilimeter .. '"',
    }

    -- Lua 5.1 compiles source strings with loadstring; 5.2+ uses load
    local compile = loadstring or load
    return assert(compile(source)), state, 0
end
|
||||
|
||||
-- takes the values from the headers in the first row of the input table
|
||||
-- Collects the keys of the first row of inputTable and returns them as a new
-- list ordered by table.sort's default comparison (alphabetical for strings).
local function extractHeaders(inputTable)
    -- must be local: a bare assignment would create or clobber a global `headers`
    local headers = {}
    for key in pairs(inputTable[1]) do
        headers[#headers + 1] = key
    end

    -- pairs() has no defined order, so sort to make the output deterministic
    table.sort(headers)

    return headers
end
|
||||
|
||||
-- turns a lua table into a csv
|
||||
-- works really quickly with luajit-2.1, because table.concat life
|
||||
-- Serializes a list of rows into a CSV string. Every field (headers included)
-- is wrapped in double quotes and every line ends with "\r\n".
--
-- @param inputTable list of rows; each row maps a header name to its value
-- @param delimiter  a string of exactly one character placed between fields
-- @param options    optional table; `options.headers` is a list of the keys to
--                   write, in order. When absent, the keys of the first row
--                   are used, sorted.
-- @return the encoded CSV as a single string
function ftcsv.encode(inputTable, delimiter, options)
    -- delimiter MUST be one character; the type is checked first because `#`
    -- on a non-string would raise (or measure a table) before the message shows
    assert(type(delimiter) == "string" and #delimiter == 1, "the delimiter must be of string type and exactly one character")

    -- grab the headers from the options if they are there
    local headers = nil
    if options then
        if options.headers ~= nil then
            assert(type(options.headers) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'headers'. You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.")
            headers = options.headers
        end
    end
    if headers == nil then
        headers = extractHeaders(inputTable)
    end

    -- newHeaders are needed if there are quotes within the header
    -- because they need to be escaped; `headers` itself keeps the raw keys
    -- used to look values up in each row
    local newHeaders = {}
    for i = 1, #headers do
        -- the parentheses drop gsub's second result (the replacement count)
        newHeaders[i] = (headers[i]:gsub('"', '""'))
    end

    -- the header row uses the same delimiter as the data rows
    local output = {}
    output[1] = '"' .. table.concat(newHeaders, '"' .. delimiter .. '"') .. '"\r\n'

    -- add each line by line.
    for i, line in writer(inputTable, delimiter, headers) do
        output[i+1] = line
    end
    return table.concat(output)
end
|
||||
|
||||
return ftcsv
|
||||
|
3
spec/csvs/escaped_quotes_in_header.csv
Normal file
3
spec/csvs/escaped_quotes_in_header.csv
Normal file
@ -0,0 +1,3 @@
|
||||
"li""on",tiger,"be""ar"
|
||||
1,2,3
|
||||
5,6,7
|
|
109
spec/feature_spec.lua
Normal file
109
spec/feature_spec.lua
Normal file
@ -0,0 +1,109 @@
|
||||
local ftcsv = require('ftcsv')
|
||||
|
||||
-- Feature tests for ftcsv.parse options: loadFromString, rename,
-- fieldsToKeep and headers.
describe("csv features", function()
    it("should handle loading from string", function()
        local expected = {{a = "apple", b = "banana", c = "carrot"}}
        local actual = ftcsv.parse("a,b,c\napple,banana,carrot", ",", {loadFromString=true})
        assert.are.same(expected, actual)
    end)

    it("should handle crlf loading from string", function()
        local expected = {{a = "apple", b = "banana", c = "carrot"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", {loadFromString=true})
        assert.are.same(expected, actual)
    end)

    it("should handle renaming a field", function()
        local expected = {{d = "apple", b = "banana", c = "carrot"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", {loadFromString=true, rename={["a"] = "d"}})
        assert.are.same(expected, actual)
    end)

    it("should handle renaming multiple fields", function()
        local expected = {{d = "apple", e = "banana", f = "carrot"}}
        local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle renaming multiple fields to the same out value", function()
        local expected = {{d = "apple", e = "carrot"}}
        local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "e"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle renaming multiple fields to the same out value with newline at end", function()
        local expected = {{d = "apple", e = "carrot"}}
        local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "e"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle only keeping a few fields", function()
        local expected = {{a = "apple", b = "banana"}}
        local options = {loadFromString=true, fieldsToKeep={"a","b"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle only keeping a few fields with a rename to an existing field", function()
        local expected = {{a = "apple", b = "carrot"}}
        local options = {loadFromString=true, fieldsToKeep={"a","b"}, rename={["c"] = "b"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle only keeping a few fields with a rename to a new field", function()
        local expected = {{a = "apple", f = "carrot"}}
        local options = {loadFromString=true, fieldsToKeep={"a","f"}, rename={["c"] = "f"}}
        local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options)
        assert.are.same(expected, actual)
    end)

    it("should handle files without headers", function()
        local expected = {
            {"apple", "banana", "carrot"},
            {"diamond", "emerald", "pearl"},
        }
        -- the option is spelled `headers`; a misspelled key is silently
        -- ignored by the parser and the file would be read as if it had a header
        local options = {loadFromString=true, headers=false}
        local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options)
        assert.are.same(expected, actual)
    end)

end)
|
12
spec/json/escaped_quotes_in_header.json
Normal file
12
spec/json/escaped_quotes_in_header.json
Normal file
@ -0,0 +1,12 @@
|
||||
[
|
||||
{
|
||||
"li\"on": "1",
|
||||
"tiger": "2",
|
||||
"be\"ar": "3"
|
||||
},
|
||||
{
|
||||
"li\"on": "5",
|
||||
"tiger": "6",
|
||||
"be\"ar": "7"
|
||||
}
|
||||
]
|
@ -1,7 +1,5 @@
|
||||
local cjson = require("cjson")
|
||||
local ftcsv = require('ftcsv')
|
||||
-- local csv = require('csv')
|
||||
-- local staecsv = require('state-csv')
|
||||
|
||||
local function loadFile(textFile)
|
||||
local file = io.open(textFile, "r")
|
||||
@ -19,6 +17,7 @@ local files = {
|
||||
"empty_no_quotes",
|
||||
"empty_crlf",
|
||||
"escaped_quotes",
|
||||
"escaped_quotes_in_header",
|
||||
"json",
|
||||
"json_no_newline",
|
||||
"newlines",
|
||||
@ -32,30 +31,33 @@ local files = {
|
||||
describe("csv decode", function()
|
||||
for _, value in ipairs(files) do
|
||||
it("should handle " .. value, function()
|
||||
local contents = loadFile("spec/csvs/" .. value .. ".csv")
|
||||
local json = loadFile("spec/json/" .. value .. ".json")
|
||||
json = cjson.decode(json)
|
||||
-- local parse = staecsv:ftcsv(contents, ",")
|
||||
local parse = ftcsv.decode(contents, ",")
|
||||
-- local f = csv.openstring(contents, {separator=",", header=true})
|
||||
-- local parse = {}
|
||||
-- for fields in f:lines() do
|
||||
-- parse[#parse+1] = fields
|
||||
-- end
|
||||
local parse = ftcsv.parse("spec/csvs/" .. value .. ".csv", ",")
|
||||
assert.are.same(json, parse)
|
||||
end)
|
||||
end
|
||||
end)
|
||||
|
||||
describe("csv decode from string", function()
|
||||
for _, value in ipairs(files) do
|
||||
it("should handle " .. value, function()
|
||||
local contents = loadFile("spec/csvs/" .. value .. ".csv")
|
||||
local json = loadFile("spec/json/" .. value .. ".json")
|
||||
json = cjson.decode(json)
|
||||
local parse = ftcsv.parse(contents, ",", {loadFromString=true})
|
||||
assert.are.same(json, parse)
|
||||
end)
|
||||
end
|
||||
end)
|
||||
|
||||
describe("csv encode", function()
|
||||
for _, value in ipairs(files) do
|
||||
it("should handle " .. value, function()
|
||||
local originalFile = loadFile("spec/csvs/" .. value .. ".csv")
|
||||
local jsonFile = loadFile("spec/json/" .. value .. ".json")
|
||||
local jsonDecode = cjson.decode(jsonFile)
|
||||
-- local parse = staecsv:ftcsv(contents, ",")
|
||||
local reEncoded = ftcsv.decode(ftcsv.encode(jsonDecode, ","), ",")
|
||||
local reEncoded = ftcsv.parse(ftcsv.encode(jsonDecode, ","), ",", {loadFromString=true})
|
||||
-- local f = csv.openstring(contents, {separator=",", header=true})
|
||||
-- local parse = {}
|
||||
-- for fields in f:lines() do
|
Loading…
Reference in New Issue
Block a user