From 11f1c6e4375e5847bbbd9873338dc254e0187fbb Mon Sep 17 00:00:00 2001 From: FourierTransformer Date: Tue, 6 Aug 2024 07:58:30 -0500 Subject: [PATCH] Make delimiter optional and fix bug when reusing options table (#42) --- README.md | 53 +++++++---- ...1.3.0-1.rockspec => ftcsv-1.4.0-1.rockspec | 6 +- ftcsv.lua | 93 ++++++++++++------- spec/dynamic_features_spec.lua | 6 +- spec/feature_spec.lua | 45 +++++++++ spec/parseLine_spec.lua | 78 ++++++++++++++++ 6 files changed, 222 insertions(+), 59 deletions(-) rename ftcsv-1.3.0-1.rockspec => ftcsv-1.4.0-1.rockspec (85%) diff --git a/README.md b/README.md index 8f73792..88f5452 100644 --- a/README.md +++ b/README.md @@ -17,16 +17,16 @@ luarocks install ftcsv There are two main parsing methods: `ftcv.parse` and `ftcsv.parseLine`. `ftcsv.parse` loads the entire file and parses it, while `ftcsv.parseLine` is an iterator that parses one line at a time. -### `ftcsv.parse(fileName, delimiter [, options])` -`ftcsv.parse` will load the entire csv file into memory, then parse it in one go, returning a lua table with the parsed data and a lua table containing the column headers. It has only two required parameters - a file name and delimiter (limited to one character). A few optional parameters can be passed in via a table (examples below). +### `ftcsv.parse(fileName, [, options])` +`ftcsv.parse` will load the entire csv file into memory, then parse it in one go, returning a lua table with the parsed data and a lua table containing the column headers. It has only one required parameter - the file name. A few optional parameters can be passed in via a table (examples below). 
Just loading a csv file: ```lua local ftcsv = require('ftcsv') -local zipcodes, headers = ftcsv.parse("free-zipcode-database.csv", ",") +local zipcodes, headers = ftcsv.parse("free-zipcode-database.csv") ``` -### `ftcsv.parseLine(fileName, delimiter, [, options])` +### `ftcsv.parseLine(fileName [, options])` `ftcsv.parseLine` will open a file and read `options.bufferSize` bytes of the file. `bufferSize` defaults to 2^16 bytes (which provides the fastest parsing on most unix-based systems), or can be specified in the options. `ftcsv.parseLine` is an iterator and returns one line at a time. When all the lines in the buffer are read, it will read in another `bufferSize` bytes of a file and repeat the process until the entire file has been read. If specifying `bufferSize` there are a couple of things to remember: @@ -37,7 +37,7 @@ If specifying `bufferSize` there are a couple of things to remember: Parsing through a csv file: ```lua local ftcsv = require("ftcsv") -for index, zipcode in ftcsv.parseLine("free-zipcode-database.csv", ",") do +for index, zipcode in ftcsv.parseLine("free-zipcode-database.csv") do print(zipcode.Zipcode) print(zipcode.State) end @@ -48,11 +48,18 @@ end The options are the same for `parseLine` and `parse`, with the exception of `loadFromString` and `bufferSize`. `loadFromString` only works with `parse` and `bufferSize` can only be specified for `parseLine`. The following are optional parameters passed in via the third argument as a table. + - `delimiter` + + If your file doesn't use the comma character as the delimiter, you can specify your own.
It is limited to one character and defaults to `,` + ```lua + ftcsv.parse("a>b>c\r\n1>2>3", {loadFromString=true, delimiter=">"}) + ``` + - `loadFromString` If you want to load a csv from a string instead of a file, set `loadFromString` to `true` (default: `false`) ```lua - ftcsv.parse("a,b,c\r\n1,2,3", ",", {loadFromString=true}) + ftcsv.parse("a,b,c\r\n1,2,3", {loadFromString=true}) ``` - `rename` @@ -63,7 +70,7 @@ The following are optional parameters passed in via the third argument as a tabl ```lua local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}} - local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", options) + local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", options) ``` - `fieldsToKeep` @@ -74,7 +81,7 @@ The following are optional parameters passed in via the third argument as a tabl ```lua local options = {loadFromString=true, fieldsToKeep={"a","f"}, rename={["c"] = "f"}} - local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options) + local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", options) ``` Also Note: If you apply a function to the headers via headerFunc, and want to select fields from fieldsToKeep, you need to have what the post-modified header would be in fieldsToKeep.
@@ -85,7 +92,7 @@ The following are optional parameters passed in via the third argument as a tabl ```lua local options = {loadFromString=true, ignoreQuotes=true} - local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', ",", options) + local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', options) ``` - `headerFunc` @@ -95,7 +102,7 @@ The following are optional parameters passed in via the third argument as a tabl Ex: making all fields uppercase ```lua local options = {loadFromString=true, headerFunc=string.upper} - local actual = ftcsv.parse("a,b,c\napple,banana,carrot", ",", options) + local actual = ftcsv.parse("a,b,c\napple,banana,carrot", options) ``` - `headers` @@ -103,15 +110,15 @@ The following are optional parameters passed in via the third argument as a tabl Set `headers` to `false` if the file you are reading doesn't have any headers. This will cause ftcsv to create indexed tables rather than a key-value tables for the output. ```lua - local options = {loadFromString=true, headers=false} - local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + local options = {loadFromString=true, headers=false, delimiter=">"} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", options) ``` Note: Header-less files can still use the `rename` option and after a field has been renamed, it can specified as a field to keep. The `rename` syntax changes a little bit: ```lua - local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}} - local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}, delimiter=">"} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", options) ``` In the above example, the first field becomes 'a', the second field becomes 'b' and so on. 
@@ -120,7 +127,7 @@ For all tested examples, take a look in /spec/feature_spec.lua The options can be string together. For example if you wanted to `loadFromString` and not use `headers`, you could use the following: ```lua -ftcsv.parse("apple,banana,carrot", ",", {loadFromString=true, headers=false}) +ftcsv.parse("apple,banana,carrot", {loadFromString=true, headers=false}) ``` ## Encoding @@ -137,7 +144,7 @@ file:close() ### Options - `fieldsToKeep` - if `fieldsToKeep` is set in the encode process, only the fields specified will be written out to a file. + if `fieldsToKeep` is set in the encode process, only the fields specified will be written out to a file. The `fieldsToKeep` will be written out in the order that is specified. ```lua local output = ftcsv.encode(everyUser, ",", {fieldsToKeep={"Name", "Phone", "City"}}) @@ -148,7 +155,7 @@ file:close() if `onlyRequiredQuotes` is set to `true`, the output will only include quotes around fields that are quotes, have newlines, or contain the delimter. ```lua - local output = ftcsv.encode(everyUser, ",", {noQuotes=true}) + local output = ftcsv.encode(everyUser, ",", {onlyRequiredQuotes=true}) ``` @@ -184,7 +191,7 @@ NOTE: times are measured using `os.clock()`, so they are in CPU seconds. Each te Benchmarks were run under ftcsv 1.2.0 ## Performance -I did some basic testing and found that in lua, if you want to iterate over a string character-by-character and compare chars, `string.byte` performs faster than `string.sub`. As such, ftcsv iterates over the whole file and does byte compares to find quotes and delimiters and then generates a table from it. When using vanilla lua, it proved faster to use `string.find` instead of iterating character by character (which is faster in LuaJIT), so ftcsv accounts for that and will perform the fastest option that is availble. If you have thoughts on how to improve performance (either big picture or specifically within the code), create a GitHub issue - I'd love to hear about it! 
+I did some basic testing and found that in lua, if you want to iterate over a string character-by-character and compare chars, `string.byte` performs faster than `string.sub`. As such, ftcsv iterates over the whole file and does byte compares to find quotes and delimiters and then generates a table from it. When using vanilla lua, it proved faster to use `string.find` instead of iterating character by character (which is faster in LuaJIT), so ftcsv accounts for that and will perform the fastest option that is available. If you have thoughts on how to improve performance (either big picture or specifically within the code), create a GitHub issue - I'd love to hear about it! ## Contributing @@ -200,6 +207,16 @@ Feel free to create a new issue for any bugs you've found or help you need. If y 8. Enjoy the changes made! +## Delimiter no longer required as of 1.4.0! +Starting with version 1.4.0, the delimiter is no longer required as the second argument. **But don't worry,** ftcsv remains backwards compatible! We check the argument types and adjust parsing as necessary. There is no intention to remove this backwards compatibility layer, so you can always enjoy your up-to-date lightning fast CSV parser! + +So this works just fine: +```lua +ftcsv.parse("a>b>c\r\n1>2>3", ">", {loadFromString=true}) +``` + +The delimiter as the second argument will always take precedence if both are provided. + ## Licenses - The main library is licensed under the MIT License. Feel free to use it!
diff --git a/ftcsv-1.3.0-1.rockspec b/ftcsv-1.4.0-1.rockspec similarity index 85% rename from ftcsv-1.3.0-1.rockspec rename to ftcsv-1.4.0-1.rockspec index aa87ea6..a6cbf7c 100644 --- a/ftcsv-1.3.0-1.rockspec +++ b/ftcsv-1.4.0-1.rockspec @@ -1,16 +1,16 @@ package = "ftcsv" -version = "1.3.0-1" +version = "1.4.0-1" source = { url = "git://github.com/FourierTransformer/ftcsv.git", - tag = "1.3.0" + tag = "1.4.0" } description = { summary = "A fast pure lua csv library (parser and encoder)", detailed = [[ ftcsv is a fast and easy to use csv library for lua. It can read in CSV files, - do some basic transformations (rename fields) and can create the csv format. + do some basic transformations (rename fields, retain, etc) and can create a CSV file. It supports UTF-8, header-less CSVs, and maintaining correct line endings for multi-line fields. diff --git a/ftcsv.lua b/ftcsv.lua index 5526630..a206c18 100644 --- a/ftcsv.lua +++ b/ftcsv.lua @@ -1,5 +1,5 @@ local ftcsv = { - _VERSION = 'ftcsv 1.3.0', + _VERSION = 'ftcsv 1.4.0', _DESCRIPTION = 'CSV library for Lua', _URL = 'https://github.com/FourierTransformer/ftcsv', _LICENSE = [[ @@ -90,7 +90,7 @@ end -- determine the real headers as opposed to the header mapping -local function determineRealHeaders(headerField, fieldsToKeep) +local function determineRealHeaders(headerField, fieldsToKeep) local realHeaders = {} local headerSet = {} for i = 1, #headerField do @@ -396,6 +396,22 @@ local function initializeInputFromStringOrFile(inputFile, options, amount) return inputString, file end +local function determineArgumentOrder(delimiter, options) + -- backwards compatibile layer + if type(delimiter) == "string" then + return delimiter, options + + -- the new format for parseLine + elseif type(delimiter) == "table" then + local realDelimiter = delimiter.delimiter or "," + return realDelimiter, delimiter + + -- if nothing is specified, assume "," delimited and call it a day! 
+ else + return ",", nil + end +end + local function parseOptions(delimiter, options, fromParseLine) -- delimiter MUST be one character assert(#delimiter == 1 and type(delimiter) == "string", "the delimiter must be of string type and exactly one character") @@ -404,50 +420,54 @@ local function parseOptions(delimiter, options, fromParseLine) if options then - if options.headers ~= nil then - assert(type(options.headers) == "boolean", "ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.") - end + if options.headers ~= nil then + assert(type(options.headers) == "boolean", "ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.") + end - if options.rename ~= nil then - assert(type(options.rename) == "table", "ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.") - end + if options.rename ~= nil then + assert(type(options.rename) == "table", "ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.") + end - if options.fieldsToKeep ~= nil then - assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. 
"'.") - local ofieldsToKeep = options.fieldsToKeep - if ofieldsToKeep ~= nil then - fieldsToKeep = {} - for j = 1, #ofieldsToKeep do - fieldsToKeep[ofieldsToKeep[j]] = true - end - end - if options.headers == false and options.rename == nil then - error("ftcsv: fieldsToKeep only works with header-less files when using the 'rename' functionality") + if options.fieldsToKeep ~= nil then + assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.") + local ofieldsToKeep = options.fieldsToKeep + if ofieldsToKeep ~= nil then + fieldsToKeep = {} + for j = 1, #ofieldsToKeep do + fieldsToKeep[ofieldsToKeep[j]] = true end end - - if options.loadFromString ~= nil then - assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. "'.") + if options.headers == false and options.rename == nil then + error("ftcsv: fieldsToKeep only works with header-less files when using the 'rename' functionality") end + end - if options.headerFunc ~= nil then - assert(type(options.headerFunc) == "function", "ftcsv only takes a function value for optional parameter 'headerFunc'. You passed in '" .. tostring(options.headerFunc) .. "' of type '" .. type(options.headerFunc) .. "'.") - end + if options.loadFromString ~= nil then + assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. 
"'.") + end - if options.ignoreQuotes == nil then - options.ignoreQuotes = false + if options.headerFunc ~= nil then + assert(type(options.headerFunc) == "function", "ftcsv only takes a function value for optional parameter 'headerFunc'. You passed in '" .. tostring(options.headerFunc) .. "' of type '" .. type(options.headerFunc) .. "'.") + end + + if options.ignoreQuotes == nil then + options.ignoreQuotes = false + else + assert(type(options.ignoreQuotes) == "boolean", "ftcsv only takes a boolean value for optional parameter 'ignoreQuotes'. You passed in '" .. tostring(options.ignoreQuotes) .. "' of type '" .. type(options.ignoreQuotes) .. "'.") + end + + if fromParseLine == true then + if options.bufferSize == nil then + options.bufferSize = 2^16 else - assert(type(options.ignoreQuotes) == "boolean", "ftcsv only takes a boolean value for optional parameter 'ignoreQuotes'. You passed in '" .. tostring(options.ignoreQuotes) .. "' of type '" .. type(options.ignoreQuotes) .. "'.") + assert(type(options.bufferSize) == "number", "ftcsv only takes a number value for optional parameter 'bufferSize'. You passed in '" .. tostring(options.bufferSize) .. "' of type '" .. type(options.bufferSize) .. "'.") end - if options.bufferSize == nil then - options.bufferSize = 2^16 - else - assert(type(options.bufferSize) == "number", "ftcsv only takes a number value for optional parameter 'bufferSize'. You passed in '" .. tostring(options.bufferSize) .. "' of type '" .. type(options.bufferSize) .. "'.") - if fromParseLine == false then - error("ftcsv: bufferSize can only be specified using 'parseLine'. When using 'parse', the entire file is read into memory") - end + else + if options.bufferSize ~= nil then + error("ftcsv: bufferSize can only be specified using 'parseLine'. When using 'parse', the entire file is read into memory") end + end else options = { @@ -539,6 +559,8 @@ end -- runs the show! 
function ftcsv.parse(inputFile, delimiter, options) + local delimiter, options = determineArgumentOrder(delimiter, options) + local options, fieldsToKeep = parseOptions(delimiter, options, false) local inputString = initializeInputFromStringOrFile(inputFile, options, "*all") @@ -573,6 +595,7 @@ local function initializeInputFile(inputString, options) end function ftcsv.parseLine(inputFile, delimiter, userOptions) + local delimiter, userOptions = determineArgumentOrder(delimiter, userOptions) local options, fieldsToKeep = parseOptions(delimiter, userOptions, true) local inputString, file = initializeInputFile(inputFile, options) diff --git a/spec/dynamic_features_spec.lua b/spec/dynamic_features_spec.lua index 29bf291..6ccbc5f 100644 --- a/spec/dynamic_features_spec.lua +++ b/spec/dynamic_features_spec.lua @@ -61,7 +61,7 @@ describe("csv features", function() end local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}} - local actual, actualHeaders = ftcsv.parse(defaultString, ",", options) + local actual, actualHeaders = ftcsv.parse(defaultString, options) assert.are.same(expected, actual) assert.are.same(expectedHeaders, actualHeaders) end) @@ -123,7 +123,7 @@ describe("csv features", function() end local options = {loadFromString=true, fieldsToKeep={"a", "b"}} - local actual, actualHeaders = ftcsv.parse(defaultString, ",", options) + local actual, actualHeaders = ftcsv.parse(defaultString, options) assert.are.same(expected, actual) assert.are.same(expectedHeaders, actualHeaders) end) @@ -347,7 +347,7 @@ describe("csv features", function() end local options = {loadFromString=true, headers=false} - local actual, actualHeaders = ftcsv.parse(defaultString, ",", options) + local actual, actualHeaders = ftcsv.parse(defaultString, options) assert.are.same(expected, actual) assert.are.same(expectedHeaders, actualHeaders) end) diff --git a/spec/feature_spec.lua b/spec/feature_spec.lua index e252dbe..88774de 100644 --- a/spec/feature_spec.lua 
+++ b/spec/feature_spec.lua @@ -71,6 +71,16 @@ describe("csv features", function() assert.are.same(expected, actual) end) + it("should handle escaped doublequotes with delimiter in options", function() + local expected = {} + expected[1] = {} + expected[1].a = 'A"B""C' + expected[1].b = 'A""B"C' + expected[1].c = 'A"""B""C' + local actual = ftcsv.parse('a;b;c\n"A""B""""C";"A""""B""C";"A""""""B""""C"', {loadFromString=true, delimiter=";"}) + assert.are.same(expected, actual) + end) + it("should handle renaming a field", function() local expected = {} expected[1] = {} @@ -492,4 +502,39 @@ describe("csv features", function() assert.are.same(expected, actual) end) + it("should handle ignoring the single quote without specifying the delimeter", function() + local expected = {} + expected[1] = {} + expected[1].a = '"apple' + expected[1].b = "banana" + expected[1].c = "carrot" + local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', {loadFromString=true, ignoreQuotes=true}) + assert.are.same(expected, actual) + end) + + it("should handle reusing the options", function() + local expected = {} + expected[1] = {} + expected[1].a = '"apple' + expected[1].b = "banana" + expected[1].c = "carrot" + local options = {loadFromString=true, ignoreQuotes=true} + local first = ftcsv.parse('a,b,c\n"apple,banana,carrot', ",", options) + local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', ",", options) + assert.are.same(expected, actual) + end) + + it("should handle reusing the options without specifying the delimeter", function() + local expected = {} + expected[1] = {} + expected[1].a = '"apple' + expected[1].b = "banana" + expected[1].c = "carrot" + local options = {loadFromString=true, ignoreQuotes=true} + local first = ftcsv.parse('a,b,c\n"apple,banana,carrot', options) + local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', options) + assert.are.same(expected, actual) + end) + + end) diff --git a/spec/parseLine_spec.lua b/spec/parseLine_spec.lua index 3c079ed..526d678 
100644 --- a/spec/parseLine_spec.lua +++ b/spec/parseLine_spec.lua @@ -86,3 +86,81 @@ describe("parseLine with options but not bufferSize", function() assert.are.same(#json, #parse) end) end) + +describe("parseLine features small, working buffer size without delimiter", function() + it("should handle correctness", function() + local json = loadFile("spec/json/correctness.json") + json = cjson.decode(json) + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=52}) do + assert.are.same(json[i], line) + parse[i] = line + end + assert.are.same(#json, #parse) + assert.are.same(json, parse) + end) +end) + +describe("parseLine features small, nonworking buffer size without delimiter", function() + it("should handle correctness", function() + local test = function() + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=63}) do + parse[i] = line + end + return parse + end + assert.has_error(test, "ftcsv: bufferSize needs to be larger to parse this file") + end) +end) + +describe("parseLine features smaller, nonworking buffer size without delimiter", function() + it("should handle correctness", function() + local test = function() + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=50}) do + parse[i] = line + end + return parse + end + assert.has_error(test, "ftcsv: bufferSize needs to be larger to parse this file") + end) +end) + +describe("smaller bufferSize than header and incorrect number of fields without delimiter", function() + it("should handle correctness", function() + local test = function() + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=23}) do + parse[i] = line + end + return parse + end + assert.has_error(test, "ftcsv: bufferSize needs to be larger to parse this file") + end) +end) + +describe("smaller bufferSize than header, but with correct field numbers without delimiter", function() + it("should 
handle correctness", function() + local test = function() + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=30}) do + parse[i] = line + end + return parse + end + assert.has_error(test, "ftcsv: bufferSize needs to be larger to parse this file") + end) +end) + +describe("parseLine with options but not bufferSize without delimiter", function() + it("should handle correctness", function() + local json = loadFile("spec/json/correctness.json") + json = cjson.decode(json) + local parse = {} + for i, line in ftcsv.parseLine("spec/csvs/correctness.csv", {rename={["Year"] = "Full Year"}}) do + parse[i] = line + end + assert.are.same(#json, #parse) + end) +end) \ No newline at end of file