Make delimiter optional and fix bug when reusing options table (#42)

This commit is contained in:
FourierTransformer 2024-08-06 07:58:30 -05:00 committed by GitHub
parent 527620af3d
commit 11f1c6e437
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 222 additions and 59 deletions

View File

@ -17,16 +17,16 @@ luarocks install ftcsv
There are two main parsing methods: `ftcsv.parse` and `ftcsv.parseLine`.
`ftcsv.parse` loads the entire file and parses it, while `ftcsv.parseLine` is an iterator that parses one line at a time.
### `ftcsv.parse(fileName, delimiter [, options])`
`ftcsv.parse` will load the entire csv file into memory, then parse it in one go, returning a lua table with the parsed data and a lua table containing the column headers. It has only two required parameters - a file name and delimiter (limited to one character). A few optional parameters can be passed in via a table (examples below).
### `ftcsv.parse(fileName [, options])`
`ftcsv.parse` will load the entire csv file into memory, then parse it in one go, returning a lua table with the parsed data and a lua table containing the column headers. It has only one required parameter - the file name. A few optional parameters can be passed in via a table (examples below).
Just loading a csv file:
```lua
local ftcsv = require('ftcsv')
local zipcodes, headers = ftcsv.parse("free-zipcode-database.csv", ",")
local zipcodes, headers = ftcsv.parse("free-zipcode-database.csv")
```
### `ftcsv.parseLine(fileName, delimiter, [, options])`
### `ftcsv.parseLine(fileName [, options])`
`ftcsv.parseLine` will open a file and read `options.bufferSize` bytes of the file. `bufferSize` defaults to 2^16 bytes (which provides the fastest parsing on most unix-based systems), or can be specified in the options. `ftcsv.parseLine` is an iterator and returns one line at a time. When all the lines in the buffer are read, it will read in another `bufferSize` bytes of a file and repeat the process until the entire file has been read.
If specifying `bufferSize` there are a couple of things to remember:
@ -37,7 +37,7 @@ If specifying `bufferSize` there are a couple of things to remember:
Parsing through a csv file:
```lua
local ftcsv = require("ftcsv")
for index, zipcode in ftcsv.parseLine("free-zipcode-database.csv", ",") do
for index, zipcode in ftcsv.parseLine("free-zipcode-database.csv") do
print(zipcode.Zipcode)
print(zipcode.State)
end
@ -48,11 +48,18 @@ end
The options are the same for `parseLine` and `parse`, with the exception of `loadFromString` and `bufferSize`. `loadFromString` only works with `parse` and `bufferSize` can only be specified for `parseLine`.
The following are optional parameters passed in via the options table (the second argument, or the third argument when a delimiter is passed positionally).
- `delimiter`
If your file doesn't use the comma character as the delimiter, you can specify your own. It is limited to one character and defaults to `,`
```lua
ftcsv.parse("a>b>c\r\n1,2,3", {loadFromString=true, delimiter=">"})
```
- `loadFromString`
If you want to load a csv from a string instead of a file, set `loadFromString` to `true` (default: `false`)
```lua
ftcsv.parse("a,b,c\r\n1,2,3", ",", {loadFromString=true})
ftcsv.parse("a,b,c\r\n1,2,3", {loadFromString=true})
```
- `rename`
@ -63,7 +70,7 @@ The following are optional parameters passed in via the third argument as a tabl
```lua
local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}}
local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", options)
local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", options)
```
- `fieldsToKeep`
@ -74,7 +81,7 @@ The following are optional parameters passed in via the third argument as a tabl
```lua
local options = {loadFromString=true, fieldsToKeep={"a","f"}, rename={["c"] = "f"}}
local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options)
local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", options)
```
Also Note: If you apply a function to the headers via headerFunc, and want to select fields from fieldsToKeep, you need to have what the post-modified header would be in fieldsToKeep.
@ -85,7 +92,7 @@ The following are optional parameters passed in via the third argument as a tabl
```lua
local options = {loadFromString=true, ignoreQuotes=true}
local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', ",", options)
local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', options)
```
- `headerFunc`
@ -95,7 +102,7 @@ The following are optional parameters passed in via the third argument as a tabl
Ex: making all fields uppercase
```lua
local options = {loadFromString=true, headerFunc=string.upper}
local actual = ftcsv.parse("a,b,c\napple,banana,carrot", ",", options)
local actual = ftcsv.parse("a,b,c\napple,banana,carrot", options)
```
- `headers`
@ -103,15 +110,15 @@ The following are optional parameters passed in via the third argument as a tabl
Set `headers` to `false` if the file you are reading doesn't have any headers. This will cause ftcsv to create indexed tables rather than key-value tables for the output.
```lua
local options = {loadFromString=true, headers=false}
local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options)
local options = {loadFromString=true, headers=false, delimiter=">"}
local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", options)
```
Note: Header-less files can still use the `rename` option and after a field has been renamed, it can be specified as a field to keep. The `rename` syntax changes a little bit:
```lua
local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}}
local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options)
local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}, delimiter=">"}
local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", options)
```
In the above example, the first field becomes 'a', the second field becomes 'b' and so on.
@ -120,7 +127,7 @@ For all tested examples, take a look in /spec/feature_spec.lua
The options can be strung together. For example if you wanted to `loadFromString` and not use `headers`, you could use the following:
```lua
ftcsv.parse("apple,banana,carrot", ",", {loadFromString=true, headers=false})
ftcsv.parse("apple,banana,carrot", {loadFromString=true, headers=false})
```
## Encoding
@ -137,7 +144,7 @@ file:close()
### Options
- `fieldsToKeep`
if `fieldsToKeep` is set in the encode process, only the fields specified will be written out to a file.
if `fieldsToKeep` is set in the encode process, only the fields specified will be written out to a file. The `fieldsToKeep` will be written out in the order that is specified.
```lua
local output = ftcsv.encode(everyUser, ",", {fieldsToKeep={"Name", "Phone", "City"}})
@ -148,7 +155,7 @@ file:close()
if `onlyRequiredQuotes` is set to `true`, the output will only include quotes around fields that contain quotes, have newlines, or contain the delimiter.
```lua
local output = ftcsv.encode(everyUser, ",", {noQuotes=true})
local output = ftcsv.encode(everyUser, ",", {onlyRequiredQuotes=true})
```
@ -184,7 +191,7 @@ NOTE: times are measured using `os.clock()`, so they are in CPU seconds. Each te
Benchmarks were run under ftcsv 1.2.0
## Performance
I did some basic testing and found that in lua, if you want to iterate over a string character-by-character and compare chars, `string.byte` performs faster than `string.sub`. As such, ftcsv iterates over the whole file and does byte compares to find quotes and delimiters and then generates a table from it. When using vanilla lua, it proved faster to use `string.find` instead of iterating character by character (which is faster in LuaJIT), so ftcsv accounts for that and will perform the fastest option that is availble. If you have thoughts on how to improve performance (either big picture or specifically within the code), create a GitHub issue - I'd love to hear about it!
I did some basic testing and found that in lua, if you want to iterate over a string character-by-character and compare chars, `string.byte` performs faster than `string.sub`. As such, ftcsv iterates over the whole file and does byte compares to find quotes and delimiters and then generates a table from it. When using vanilla lua, it proved faster to use `string.find` instead of iterating character by character (which is faster in LuaJIT), so ftcsv accounts for that and will perform the fastest option that is available. If you have thoughts on how to improve performance (either big picture or specifically within the code), create a GitHub issue - I'd love to hear about it!
## Contributing
@ -200,6 +207,16 @@ Feel free to create a new issue for any bugs you've found or help you need. If y
8. Enjoy the changes made!
## Delimiter no longer required as of 1.4.0!
Starting with version 1.4.0, the delimiter is no longer required as the second argument. **But don't worry,** ftcsv remains backwards compatible! We check the argument types and adjust parsing as necessary. There is no intention to remove this backwards compatibility layer, so you can always enjoy your up-to-date lightning fast CSV parser!
So this works just fine:
```lua
ftcsv.parse("a>b>c\r\n1,2,3", ">", {loadFromString=true})
```
The delimiter as the second argument will always take precedence if both are provided.
## Licenses
- The main library is licensed under the MIT License. Feel free to use it!

View File

@ -1,16 +1,16 @@
package = "ftcsv"
version = "1.3.0-1"
version = "1.4.0-1"
source = {
url = "git://github.com/FourierTransformer/ftcsv.git",
tag = "1.3.0"
tag = "1.4.0"
}
description = {
summary = "A fast pure lua csv library (parser and encoder)",
detailed = [[
ftcsv is a fast and easy to use csv library for lua. It can read in CSV files,
do some basic transformations (rename fields) and can create the csv format.
do some basic transformations (rename fields, retain, etc) and can create a CSV file.
It supports UTF-8, header-less CSVs, and maintaining correct line endings for
multi-line fields.

View File

@ -1,5 +1,5 @@
local ftcsv = {
_VERSION = 'ftcsv 1.3.0',
_VERSION = 'ftcsv 1.4.0',
_DESCRIPTION = 'CSV library for Lua',
_URL = 'https://github.com/FourierTransformer/ftcsv',
_LICENSE = [[
@ -90,7 +90,7 @@ end
-- determine the real headers as opposed to the header mapping
local function determineRealHeaders(headerField, fieldsToKeep)
local function determineRealHeaders(headerField, fieldsToKeep)
local realHeaders = {}
local headerSet = {}
for i = 1, #headerField do
@ -396,6 +396,22 @@ local function initializeInputFromStringOrFile(inputFile, options, amount)
return inputString, file
end
-- Normalizes the two calling conventions accepted by the public entry points
-- into a (delimiter, options) pair:
--   legacy: f(input, delimiter, options)  -- delimiter is a string
--   new:    f(input, options)             -- options table in the 2nd slot
--
-- @param delimiter string|table|nil  second positional argument from the caller
-- @param options   table|nil         third positional argument from the caller
-- @return string     the delimiter to use (defaults to ",")
-- @return table|nil  the options table, or nil when none was supplied
local function determineArgumentOrder(delimiter, options)
    -- backwards compatible layer: an explicit string delimiter always wins
    if type(delimiter) == "string" then
        return delimiter, options
    end

    -- the new format: the options table sits where the delimiter used to be
    if type(delimiter) == "table" then
        return delimiter.delimiter or ",", delimiter
    end

    -- no usable delimiter in the second slot, but options were still passed
    -- third (e.g. parse(input, nil, options)): keep them instead of silently
    -- dropping them, and honour options.delimiter if present
    if type(options) == "table" then
        return options.delimiter or ",", options
    end

    -- if nothing is specified, assume "," delimited and call it a day!
    return ",", nil
end
local function parseOptions(delimiter, options, fromParseLine)
-- delimiter MUST be one character
assert(#delimiter == 1 and type(delimiter) == "string", "the delimiter must be of string type and exactly one character")
@ -404,50 +420,54 @@ local function parseOptions(delimiter, options, fromParseLine)
if options then
if options.headers ~= nil then
assert(type(options.headers) == "boolean", "ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.")
end
if options.headers ~= nil then
assert(type(options.headers) == "boolean", "ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.")
end
if options.rename ~= nil then
assert(type(options.rename) == "table", "ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.")
end
if options.rename ~= nil then
assert(type(options.rename) == "table", "ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.")
end
if options.fieldsToKeep ~= nil then
assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.")
local ofieldsToKeep = options.fieldsToKeep
if ofieldsToKeep ~= nil then
fieldsToKeep = {}
for j = 1, #ofieldsToKeep do
fieldsToKeep[ofieldsToKeep[j]] = true
end
end
if options.headers == false and options.rename == nil then
error("ftcsv: fieldsToKeep only works with header-less files when using the 'rename' functionality")
if options.fieldsToKeep ~= nil then
assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.")
local ofieldsToKeep = options.fieldsToKeep
if ofieldsToKeep ~= nil then
fieldsToKeep = {}
for j = 1, #ofieldsToKeep do
fieldsToKeep[ofieldsToKeep[j]] = true
end
end
if options.loadFromString ~= nil then
assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. "'.")
if options.headers == false and options.rename == nil then
error("ftcsv: fieldsToKeep only works with header-less files when using the 'rename' functionality")
end
end
if options.headerFunc ~= nil then
assert(type(options.headerFunc) == "function", "ftcsv only takes a function value for optional parameter 'headerFunc'. You passed in '" .. tostring(options.headerFunc) .. "' of type '" .. type(options.headerFunc) .. "'.")
end
if options.loadFromString ~= nil then
assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. "'.")
end
if options.ignoreQuotes == nil then
options.ignoreQuotes = false
if options.headerFunc ~= nil then
assert(type(options.headerFunc) == "function", "ftcsv only takes a function value for optional parameter 'headerFunc'. You passed in '" .. tostring(options.headerFunc) .. "' of type '" .. type(options.headerFunc) .. "'.")
end
if options.ignoreQuotes == nil then
options.ignoreQuotes = false
else
assert(type(options.ignoreQuotes) == "boolean", "ftcsv only takes a boolean value for optional parameter 'ignoreQuotes'. You passed in '" .. tostring(options.ignoreQuotes) .. "' of type '" .. type(options.ignoreQuotes) .. "'.")
end
if fromParseLine == true then
if options.bufferSize == nil then
options.bufferSize = 2^16
else
assert(type(options.ignoreQuotes) == "boolean", "ftcsv only takes a boolean value for optional parameter 'ignoreQuotes'. You passed in '" .. tostring(options.ignoreQuotes) .. "' of type '" .. type(options.ignoreQuotes) .. "'.")
assert(type(options.bufferSize) == "number", "ftcsv only takes a number value for optional parameter 'bufferSize'. You passed in '" .. tostring(options.bufferSize) .. "' of type '" .. type(options.bufferSize) .. "'.")
end
if options.bufferSize == nil then
options.bufferSize = 2^16
else
assert(type(options.bufferSize) == "number", "ftcsv only takes a number value for optional parameter 'bufferSize'. You passed in '" .. tostring(options.bufferSize) .. "' of type '" .. type(options.bufferSize) .. "'.")
if fromParseLine == false then
error("ftcsv: bufferSize can only be specified using 'parseLine'. When using 'parse', the entire file is read into memory")
end
else
if options.bufferSize ~= nil then
error("ftcsv: bufferSize can only be specified using 'parseLine'. When using 'parse', the entire file is read into memory")
end
end
else
options = {
@ -539,6 +559,8 @@ end
-- runs the show!
function ftcsv.parse(inputFile, delimiter, options)
local delimiter, options = determineArgumentOrder(delimiter, options)
local options, fieldsToKeep = parseOptions(delimiter, options, false)
local inputString = initializeInputFromStringOrFile(inputFile, options, "*all")
@ -573,6 +595,7 @@ local function initializeInputFile(inputString, options)
end
function ftcsv.parseLine(inputFile, delimiter, userOptions)
local delimiter, userOptions = determineArgumentOrder(delimiter, userOptions)
local options, fieldsToKeep = parseOptions(delimiter, userOptions, true)
local inputString, file = initializeInputFile(inputFile, options)

View File

@ -61,7 +61,7 @@ describe("csv features", function()
end
local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}}
local actual, actualHeaders = ftcsv.parse(defaultString, ",", options)
local actual, actualHeaders = ftcsv.parse(defaultString, options)
assert.are.same(expected, actual)
assert.are.same(expectedHeaders, actualHeaders)
end)
@ -123,7 +123,7 @@ describe("csv features", function()
end
local options = {loadFromString=true, fieldsToKeep={"a", "b"}}
local actual, actualHeaders = ftcsv.parse(defaultString, ",", options)
local actual, actualHeaders = ftcsv.parse(defaultString, options)
assert.are.same(expected, actual)
assert.are.same(expectedHeaders, actualHeaders)
end)
@ -347,7 +347,7 @@ describe("csv features", function()
end
local options = {loadFromString=true, headers=false}
local actual, actualHeaders = ftcsv.parse(defaultString, ",", options)
local actual, actualHeaders = ftcsv.parse(defaultString, options)
assert.are.same(expected, actual)
assert.are.same(expectedHeaders, actualHeaders)
end)

View File

@ -71,6 +71,16 @@ describe("csv features", function()
assert.are.same(expected, actual)
end)
-- Doubled quotes inside quoted fields must decode to single quotes when the
-- ";" delimiter is supplied through the options table rather than positionally.
it("should handle escaped doublequotes with delimiter in options", function()
    local input = 'a;b;c\n"A""B""""C";"A""""B""C";"A""""""B""""C"'
    local expected = {
        { a = 'A"B""C', b = 'A""B"C', c = 'A"""B""C' },
    }
    local actual = ftcsv.parse(input, {loadFromString=true, delimiter=";"})
    assert.are.same(expected, actual)
end)
it("should handle renaming a field", function()
local expected = {}
expected[1] = {}
@ -492,4 +502,39 @@ describe("csv features", function()
assert.are.same(expected, actual)
end)
-- With ignoreQuotes set, a stray leading quote is kept as literal field data;
-- no delimiter is passed, so the default applies.
it("should handle ignoring the single quote without specifying the delimeter", function()
    local expected = {
        { a = '"apple', b = "banana", c = "carrot" },
    }
    local parseOptions = {loadFromString=true, ignoreQuotes=true}
    local actual = ftcsv.parse('a,b,c\n"apple,banana,carrot', parseOptions)
    assert.are.same(expected, actual)
end)
-- Regression test: the same options table is handed to ftcsv.parse twice
-- (legacy positional-delimiter form). Both calls must succeed and produce the
-- same rows, so the result of each call is asserted.
it("should handle reusing the options", function()
    local input = 'a,b,c\n"apple,banana,carrot'
    local expected = {
        { a = '"apple', b = "banana", c = "carrot" },
    }
    local options = {loadFromString=true, ignoreQuotes=true}
    local first = ftcsv.parse(input, ",", options)
    local second = ftcsv.parse(input, ",", options)
    assert.are.same(expected, first)
    assert.are.same(expected, second)
end)
-- Regression test: the same options table is handed to ftcsv.parse twice
-- (new form, no positional delimiter). Both calls must succeed and produce
-- the same rows, so the result of each call is asserted.
it("should handle reusing the options without specifying the delimeter", function()
    local input = 'a,b,c\n"apple,banana,carrot'
    local expected = {
        { a = '"apple', b = "banana", c = "carrot" },
    }
    local options = {loadFromString=true, ignoreQuotes=true}
    local first = ftcsv.parse(input, options)
    local second = ftcsv.parse(input, options)
    assert.are.same(expected, first)
    assert.are.same(expected, second)
end)
end)

View File

@ -86,3 +86,81 @@ describe("parseLine with options but not bufferSize", function()
assert.are.same(#json, #parse)
end)
end)
-- parseLine called with only an options table (no positional delimiter) and a
-- small bufferSize; every yielded row is compared against the JSON fixture.
describe("parseLine features small, working buffer size without delimiter", function()
    it("should handle correctness", function()
        local expected = cjson.decode(loadFile("spec/json/correctness.json"))
        local rows = {}
        for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=52}) do
            assert.are.same(expected[index], row)
            rows[index] = row
        end
        assert.are.same(#expected, #rows)
        assert.are.same(expected, rows)
    end)
end)
-- Iterating with bufferSize=63 is expected to raise the "bufferSize needs to
-- be larger" error when no positional delimiter is given.
describe("parseLine features small, nonworking buffer size without delimiter", function()
    it("should handle correctness", function()
        local function readEveryLine()
            local rows = {}
            for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=63}) do
                rows[index] = row
            end
            return rows
        end
        assert.has_error(readEveryLine, "ftcsv: bufferSize needs to be larger to parse this file")
    end)
end)
-- Iterating with bufferSize=50 is expected to raise the "bufferSize needs to
-- be larger" error when no positional delimiter is given.
describe("parseLine features smaller, nonworking buffer size without delimiter", function()
    it("should handle correctness", function()
        local function readEveryLine()
            local rows = {}
            for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=50}) do
                rows[index] = row
            end
            return rows
        end
        assert.has_error(readEveryLine, "ftcsv: bufferSize needs to be larger to parse this file")
    end)
end)
-- Iterating with bufferSize=23 is expected to raise the "bufferSize needs to
-- be larger" error when no positional delimiter is given.
describe("smaller bufferSize than header and incorrect number of fields without delimiter", function()
    it("should handle correctness", function()
        local function readEveryLine()
            local rows = {}
            for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=23}) do
                rows[index] = row
            end
            return rows
        end
        assert.has_error(readEveryLine, "ftcsv: bufferSize needs to be larger to parse this file")
    end)
end)
-- Iterating with bufferSize=30 is expected to raise the "bufferSize needs to
-- be larger" error when no positional delimiter is given.
describe("smaller bufferSize than header, but with correct field numbers without delimiter", function()
    it("should handle correctness", function()
        local function readEveryLine()
            local rows = {}
            for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {bufferSize=30}) do
                rows[index] = row
            end
            return rows
        end
        assert.has_error(readEveryLine, "ftcsv: bufferSize needs to be larger to parse this file")
    end)
end)
-- parseLine called with an options table that sets `rename` but no bufferSize
-- and no positional delimiter; only the number of rows is compared against
-- the JSON fixture.
describe("parseLine with options but not bufferSize without delimiter", function()
    it("should handle correctness", function()
        local expected = cjson.decode(loadFile("spec/json/correctness.json"))
        local rows = {}
        for index, row in ftcsv.parseLine("spec/csvs/correctness.csv", {rename={["Year"] = "Full Year"}}) do
            rows[index] = row
        end
        assert.are.same(#expected, #rows)
    end)
end)