diff --git a/README.md b/README.md new file mode 100644 index 0000000..c33033d --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +# ftcsv +ftcsv, a fairly fast csv library written in pure Lua. It's been tested with LuaJIT 2.0/2.1 and Lua 5.2. + +It works well for CSVs that can easily be fully loaded into memory (easily up to a hundred MB). Currently, there isn't a "large" file mode with proper readers and writers for ingesting CSVs in bulk with a fixed amount of memory. It correctly handles both `\n` (LF) and `\r\n` (CRLF) line endings (i.e., it should work with Windows and Mac/Linux line endings) and has UTF-8 support. + + + +## Installing +You can either grab `ftcsv.lua` from here or install via luarocks: + +``` +luarocks install ftcsv +``` + + +## Parsing +### `ftcsv.parse(fileName, delimiter [, options])` + +ftcsv will load the entire csv file into memory, then parse it in one go, returning a lua table with the parsed data. It has only two required parameters - a file name and delimiter (limited to one character). A few optional parameters can be passed in via a table (examples below). + +Just loading a csv file: +```lua +local ftcsv = require('ftcsv') +local zipcodes = ftcsv.parse("free-zipcode-database.csv", ",") +``` + +### Options +The following are optional parameters passed in via the third argument as a table. For example, if you wanted to `loadFromString` and not use `headers`, you could use the following: +```lua +ftcsv.parse("apple,banana,carrot", ",", {loadFromString=true, headers=false}) +``` + - `loadFromString` + + If you want to load a csv from a string instead of a file, set `loadFromString` to `true` (default: `false`) + ```lua + ftcsv.parse("a,b,c\r\n1,2,3", ",", {loadFromString=true}) + ``` + + - `rename` + + If you want to rename a field, you can set `rename` to change the field names. 
The below example will change the headers from `a,b,c` to `d,e,f` + + Note: You can rename two fields to the same value; ftcsv will keep the field that appears latest in the line. + + ```lua + local options = {loadFromString=true, rename={["a"] = "d", ["b"] = "e", ["c"] = "f"}} + local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot", ",", options) + ``` + + - `fieldsToKeep` + + If you only want to keep certain fields from the CSV, send them in as a table-list and it should parse a little faster and use less memory. + + Note: If you want to keep a renamed field, put the new name of the field in `fieldsToKeep`: + + ```lua + local options = {loadFromString=true, fieldsToKeep={"a","f"}, rename={["c"] = "f"}} + local actual = ftcsv.parse("a,b,c\r\napple,banana,carrot\r\n", ",", options) + ``` + + - `headers` + + Set `headers` to `false` if the file you are reading doesn't have any headers. This will cause ftcsv to create indexed tables rather than key-value tables for the output. + + ```lua + local options = {loadFromString=true, headers=false} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + ``` + + Note: Header-less files can still use the `rename` option and after a field has been renamed, it can be specified as a field to keep. The `rename` syntax changes a little bit: + + ```lua + local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + ``` + + In the above example, the first field becomes 'a', the second field becomes 'b' and so on. + +For all tested examples, take a look in /spec/feature_spec.lua + + +## Encoding +### `ftcsv.encode(inputTable, delimiter[, options])` +ftcsv can also take a lua table and turn it into a text string to be written to a file. It has two required parameters, an inputTable and a delimiter. 
You can use it to write out a file like this: +```lua +local fileOutput = ftcsv.encode(users, ",") +local file = assert(io.open("ALLUSERS.csv", "w")) +file:write(fileOutput) +file:close() +``` + +### Options + - `fieldsToKeep` + + If `fieldsToKeep` is set in the encode process, only the fields specified will be written out to a file. + + ```lua + local output = ftcsv.encode(everyUser, ",", {fieldsToKeep={"Name", "Phone", "City"}}) + ``` + + + +## Performance +I did some basic testing and found that in Lua, if you want to iterate over a string character-by-character and look for single chars, `string.byte` performs better than `string.sub`. As such, ftcsv iterates over the whole file and does byte compares to find quotes and delimiters and then generates a table from it. If you have thoughts on how to improve performance (either big picture or specifically within the code), create a GitHub issue - I'd love to hear about it! + + + +## Contributing +Feel free to create a new issue for any bugs you've found or help you need. If you want to contribute back to the project, please do the following: + 1. Fork the repo + 2. Create a new branch + 3. Push your changes to the branch + 4. Run the test suite and make sure it still works + 5. Submit a pull request + 6. ??? + 7. Enjoy the changes made to the repo! + + + +## Licenses + - The main library is licensed under the MIT License. Feel free to use it! 
+ - Some of the test CSVs are from [csv-spectrum](https://github.com/maxogden/csv-spectrum) (BSD-2-Clause) which includes some from [csvkit](https://github.com/wireservice/csvkit) (MIT License) \ No newline at end of file diff --git a/ftcsv-1.0.0-1.rockspec b/ftcsv-1.0.0-1.rockspec new file mode 100644 index 0000000..70f5dea --- /dev/null +++ b/ftcsv-1.0.0-1.rockspec @@ -0,0 +1,33 @@ +package = "ftcsv" +version = "1.0.0-1" + +source = { + url = "git://github.com/FourierTransformer/ftcsv.git", + tag = "1.0.0" +} + +description = { + summary = "A fairly fast csv library written in pure Lua", + detailed = [[ + ftcsv is a fast and easy to use csv library for lua. It can read in CSV files, + do some basic transformations (rename fields) and can create the csv format. + It supports UTF-8, header-less CSVs, and maintaining correct line endings for + multi-line fields. + + Note: Currently it cannot load CSV files where the file can't fit in memory. + ]], + homepage = "https://github.com/FourierTransformer/ftcsv", + maintainer = "Shakil Thakur ", + license = "MIT" +} + +dependencies = { + "lua >= 5.1, <5.3", +} + +build = { + type = "builtin", + modules = { + ["ftcsv"] = "ftcsv.lua" + }, +} \ No newline at end of file diff --git a/ftcsv.lua b/ftcsv.lua index d6c2079..29d5b0e 100644 --- a/ftcsv.lua +++ b/ftcsv.lua @@ -49,7 +49,8 @@ local function createNewField(inputString, quote, fieldStart, i, line, fieldNum, -- so, if we just recently de-escaped, we don't want the trailing \" -- if fieldsToKeep == nil then -- local fieldsToKeep = fieldsToKeep - local output = line[fieldNum] + -- print(fieldNum) + -- print(fieldsToKeep[fieldNum]) if fieldsToKeep == nil or fieldsToKeep[fieldNum] then -- print(fieldsToKeep) -- print("b4", i, fieldNum, line[fieldNum]) @@ -71,7 +72,7 @@ local function createNewField(inputString, quote, fieldStart, i, line, fieldNum, end -- creates the headers after reading through to the first line -local function createHeaders(line, rename, fieldsToKeep) 
+local function createHeaders(line, rename) -- print("CREATING HEADERS") local headers = {} for i = 1, #line do @@ -82,12 +83,7 @@ local function createHeaders(line, rename, fieldsToKeep) headers[i] = line[i] end end - if fieldsToKeep ~= nil then - for i = 1, #fieldsToKeep do - fieldsToKeep[fieldsToKeep[i]] = true - end - end - return headers, 0, true, fieldsToKeep + return headers, 0, true end -- main function used to parse @@ -118,6 +114,9 @@ function ftcsv.parse(inputFile, delimiter, options) if options.fieldsToKeep ~= nil then assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.") ofieldsToKeep = options.fieldsToKeep + if header == false then + assert(next(rename) ~= nil, "ftcsv can only have fieldsToKeep for header-less files when they have been renamed. Please add the 'rename' option and try again.") + end end if options.loadFromString ~= nil then assert(type(options.loadFromString) == "boolean", "ftcsv only takes a boolean value for optional parameter 'loadFromString'. You passed in '" .. tostring(options.loadFromString) .. "' of type '" .. type(options.loadFromString) .. 
"'.") @@ -199,10 +198,32 @@ function ftcsv.parse(inputFile, delimiter, options) doubleQuoteEscape = createNewField(inputString, quote, fieldStart, i, outResults[lineNum], headerField[fieldNum], doubleQuoteEscape, fieldsToKeep) -- if we have headers then we gotta do something about it - if header and lineNum == 1 and not headerSet then - headerField, lineNum, headerSet, fieldsToKeep = createHeaders(outResults[lineNum], rename, ofieldsToKeep) + if lineNum == 1 and not headerSet then + if ofieldsToKeep ~= nil then + fieldsToKeep = {} + for i = 1, #ofieldsToKeep do + fieldsToKeep[ofieldsToKeep[i]] = true + end + end + if header then + headerField, lineNum, headerSet = createHeaders(outResults[lineNum], rename) + else + -- files without headers, but with a rename need to be handled too! + if #rename > 0 then + for j = 1, math.max(#rename, #headerField) do + headerField[j] = rename[j] + -- this is an odd case of where there are certain fields to be kept + if fieldsToKeep == nil or fieldsToKeep[rename[j]] then + outResults[1][rename[j]] = outResults[1][j] + end + -- print("J", j) + outResults[1][j] = nil + end + end + end end + -- incrememnt for new line lineNum = lineNum + 1 outResults[lineNum] = {} fieldNum = 1 @@ -314,9 +335,9 @@ function ftcsv.encode(inputTable, delimiter, options) -- grab the headers from the options if they are there local headers = nil if options then - if options.headers ~= nil then - assert(type(options.headers) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'headers'. You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.") - headers = options.headers + if options.fieldsToKeep ~= nil then + assert(type(options.fieldsToKeep) == "table", "ftcsv only takes in a list (as a table) for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. 
"'.") + headers = options.fieldsToKeep end end if headers == nil then diff --git a/spec/feature_spec.lua b/spec/feature_spec.lua index 819d644..b935796 100644 --- a/spec/feature_spec.lua +++ b/spec/feature_spec.lua @@ -102,8 +102,55 @@ describe("csv features", function() expected[2][1] = "diamond" expected[2][2] = "emerald" expected[2][3] = "pearl" - local options = {loadFromString=true, header=false} + local options = {loadFromString=true, headers=false} local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + assert.are.same(expected, actual) + end) + + it("should error out for fieldsToKeep if no headers and no renaming", function() + local options = {loadFromString=true, headers=false, fieldsToKeep={1, 2}} + assert.has.errors(function() ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) end) + end) + + it("should handle only renaming fields from files without headers", function() + local expected = {} + expected[1] = {} + expected[1].a = "apple" + expected[1].b = "banana" + expected[1].c = "carrot" + expected[2] = {} + expected[2].a = "diamond" + expected[2].b = "emerald" + expected[2].c = "pearl" + local options = {loadFromString=true, headers=false, rename={"a","b","c"}} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + assert.are.same(expected, actual) + end) + + it("should handle only renaming fields from files without headers and only keeping a few fields", function() + local expected = {} + expected[1] = {} + expected[1].a = "apple" + expected[1].b = "banana" + expected[2] = {} + expected[2].a = "diamond" + expected[2].b = "emerald" + local options = {loadFromString=true, headers=false, rename={"a","b","c"}, fieldsToKeep={"a","b"}} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + assert.are.same(expected, actual) + end) + + it("should handle if the number of renames doesn't equal the number of fields", function() + local 
expected = {} + expected[1] = {} + expected[1].a = "apple" + expected[1].b = "banana" + expected[2] = {} + expected[2].a = "diamond" + expected[2].b = "emerald" + local options = {loadFromString=true, headers=false, rename={"a","b"}, fieldsToKeep={"a","b"}} + local actual = ftcsv.parse("apple>banana>carrot\ndiamond>emerald>pearl", ">", options) + assert.are.same(expected, actual) end) end) \ No newline at end of file