adding ftcsv and test scripts

2024-11-19 19:54:23 +00:00 · 2016-03-09 06:37:25 -06:00 · 2016-03-09 06:37:25 -06:00 · 572997ae12
commit 572997ae12
32 changed files with 537 additions and 0 deletions
--- a/ftcsv.lua
+++ b/ftcsv.lua
@ -0,0 +1,270 @@
+--- ftcsv: CSV decoding and encoding.
+-- The public functions `ftcsv.decode` and `ftcsv.encode` are attached to this
+-- table, which is returned at the end of the file.
+local ftcsv = {}
+
+--- Scan forward for the quote that terminates a quoted field.
+-- @param i                 index of the first character after the opening quote
+-- @param inputLength       length of inputString in bytes
+-- @param inputString       the raw CSV text
+-- @param quote             byte value of the quote character
+-- @param doubleQuoteEscape flag carried in by the caller; becomes true once an
+--                          escaped quote ("") is seen inside the field
+-- @return the index of the last character before the closing quote, followed by
+--         the (possibly updated) doubleQuoteEscape flag
+-- Raises an error when the input ends before the field is closed.
+local function findClosingQuote(i, inputLength, inputString, quote, doubleQuoteEscape)
+    local firstChar = i
+    while i <= inputLength do
+        if string.byte(inputString, i) ~= quote then
+            i = i + 1
+        elseif string.byte(inputString, i + 1) == quote then
+            -- an escaped quote ("") inside the field: remember that the field
+            -- needs un-escaping later and step over both characters
+            doubleQuoteEscape = true
+            i = i + 2
+        else
+            -- a lone quote closes the field
+            return i - 1, doubleQuoteEscape
+        end
+    end
+    -- Returning nothing here would leave the caller with a nil index and an
+    -- unrelated "arithmetic on a nil value" failure, so report the real problem.
+    error("ftcsv: unterminated quoted field starting at position " .. (firstChar - 1))
+end
+
+--- Store the text of one finished field into `line`.
+-- The field spans inputString[fieldStart .. i-1]; when the character just before
+-- `i` is a quote, that character is left out of the stored text.
+-- Nothing is stored when `fieldsToKeep` is given and holds no truthy entry for `fieldNum`.
+-- @param inputString       the raw CSV text
+-- @param quote             byte value of the quote character
+-- @param fieldStart        index of the first character of the field
+-- @param i                 index one past the end of the field
+-- @param line              row table that receives the field
+-- @param fieldNum          key under which the field is stored in `line`
+-- @param doubleQuoteEscape true when doubled quotes ("") must be collapsed
+-- @param fieldsToKeep      nil, or a table whose truthy keys name the fields to store
+-- @return false after collapsing doubled quotes in a stored field, otherwise nothing
+local function createNewField(inputString, quote, fieldStart, i, line, fieldNum, doubleQuoteEscape, fieldsToKeep)
+    local wanted = fieldsToKeep == nil or fieldsToKeep[fieldNum]
+    if not wanted then
+        return
+    end
+
+    -- leave out the trailing quote when the field ends with one
+    local fieldEnd = i - 1
+    if string.byte(inputString, fieldEnd) == quote then
+        fieldEnd = fieldEnd - 1
+    end
+    line[fieldNum] = string.sub(inputString, fieldStart, fieldEnd)
+
+    if not doubleQuoteEscape then
+        return
+    end
+
+    -- the field contained escaped quotes: turn every "" back into a single "
+    -- (single-target assignment keeps only gsub's first result, the string)
+    line[fieldNum] = line[fieldNum]:gsub('""', '"')
+    return false
+end
+
+--- Turn the first parsed line into the header mapping used for all later lines.
+-- @param line         list of header names as read from the input
+-- @param rename       table mapping a header name to the key that should replace it
+-- @param fieldsToKeep nil, or a list of header names that should be kept
+-- @return headers  list where headers[i] is the (possibly renamed) key of column i
+-- @return 0        the line counter value the caller continues from
+-- @return true     marks the headers as set
+-- @return keep     nil when fieldsToKeep is nil, otherwise a lookup table
+--                  { [name] = true } built from the list
+local function createHeaders(line, rename, fieldsToKeep)
+    local headers = {}
+    for i = 1, #line do
+        headers[i] = rename[line[i]] or line[i]
+    end
+
+    -- Build the lookup in a fresh table: writing the keys into the list itself
+    -- would modify the table the caller passed in as an option, and its numeric
+    -- entries could match a header that was renamed to a number.
+    local keep = nil
+    if fieldsToKeep ~= nil then
+        keep = {}
+        for i = 1, #fieldsToKeep do
+            keep[fieldsToKeep[i]] = true
+        end
+    end
+
+    return headers, 0, true, keep
+end
+
+--- Parse CSV text into a list of row tables.
+-- With headers enabled (the default) the first line supplies the keys of every
+-- following row and is not itself part of the result; with `headers = false`
+-- rows are keyed by column number.
+-- @param inputString the CSV text to parse
+-- @param separator   field separator; a string of exactly one character
+-- @param options     optional table with any of:
+--                      headers      (boolean, default true) treat the first line as the header
+--                      rename       (table) maps a header name to the key used instead
+--                      fieldsToKeep (list) keys (after renaming) to store; only
+--                                   applied when the header line is processed
+-- @return a table holding one table per parsed line
+-- Raises an error for an invalid separator or an option of the wrong type.
+function ftcsv.decode(inputString, separator, options)
+    -- each line in outResults holds another table
+    local outResults = {}
+    outResults[1] = {}
+
+    -- separator MUST be a string of exactly one character; check the type first
+    -- so a nil or numeric separator gets this message rather than a length error
+    if type(separator) ~= "string" or #separator ~= 1 then
+        error("the separator must be of string type and exactly one character")
+    end
+    separator = string.byte(separator)
+
+    -- OPTIONS
+    local header = true
+    local rename = {}
+    local fieldsToKeep = nil
+    local ofieldsToKeep = nil
+    if options then
+        -- tostring() keeps the messages below usable for values that cannot be
+        -- concatenated directly (booleans, tables, ...)
+        if options.headers ~= nil then
+            if type(options.headers) ~= "boolean" then
+                error("ftcsv only takes the boolean 'true' or 'false' for the optional parameter 'headers' (default 'true'). You passed in '" .. tostring(options.headers) .. "' of type '" .. type(options.headers) .. "'.")
+            end
+            header = options.headers
+        end
+        if options.rename ~= nil then
+            if type(options.rename) ~= "table" then
+                error("ftcsv only takes in a key-value table for the optional parameter 'rename'. You passed in '" .. tostring(options.rename) .. "' of type '" .. type(options.rename) .. "'.")
+            end
+            rename = options.rename
+        end
+        if options.fieldsToKeep ~= nil then
+            ofieldsToKeep = options.fieldsToKeep
+            if type(options.fieldsToKeep) ~= "table" then
+                error("ftcsv only takes in a list (as a table for the optional parameter 'fieldsToKeep'. You passed in '" .. tostring(options.fieldsToKeep) .. "' of type '" .. type(options.fieldsToKeep) .. "'.")
+            end
+        end
+    end
+
+    local CR = string.byte("\r")
+    local LF = string.byte("\n")
+    local quote = string.byte("\"")
+    local doubleQuoteEscape = false
+    local fieldStart = 1
+    local fieldNum = 1
+    local lineNum = 1
+    local skipChar = 0
+    local inputLength = #inputString
+    local headerField = {}
+    local headerSet = false
+    local i = 1
+
+    -- keep track of the current character and the one after it
+    local currentChar, nextChar = string.byte(inputString, i), string.byte(inputString, i+1)
+
+    while i <= inputLength do
+        -- currentChar is carried over from the previous iteration; only the
+        -- look-ahead character has to be fetched
+        nextChar = string.byte(inputString, i+1)
+
+        -- empty string
+        if currentChar == quote and nextChar == quote then
+            skipChar = 1
+            fieldStart = i + 2
+
+        -- opening quote: jump straight to the end of the quoted field
+        elseif currentChar == quote and nextChar ~= quote then
+            fieldStart = i + 1
+            i, doubleQuoteEscape = findClosingQuote(i+1, inputLength, inputString, quote, doubleQuoteEscape)
+            -- i now sits just before the closing quote; skip over the quote as well
+            skipChar = 1
+
+        -- create some fields if we can!
+        elseif currentChar == separator then
+            -- until headers are set, columns of the first line are keyed by position
+            if not headerSet and lineNum == 1 then
+                headerField[fieldNum] = fieldNum
+            end
+            -- create the new field
+            doubleQuoteEscape = createNewField(inputString, quote, fieldStart, i, outResults[lineNum], headerField[fieldNum], doubleQuoteEscape, fieldsToKeep)
+
+            fieldNum = fieldNum + 1
+            fieldStart = i + 1
+
+        -- newline?!
+        elseif ((currentChar == CR and nextChar == LF) or currentChar == LF) then
+            -- keep track of headers
+            if not headerSet and lineNum == 1 then
+                headerField[fieldNum] = fieldNum
+            end
+
+            -- create the new field
+            doubleQuoteEscape = createNewField(inputString, quote, fieldStart, i, outResults[lineNum], headerField[fieldNum], doubleQuoteEscape, fieldsToKeep)
+
+            -- the first line becomes the header mapping; lineNum is reset so the
+            -- header line's slot in outResults is reused by the first data line
+            if header and lineNum == 1 and not headerSet then
+                headerField, lineNum, headerSet, fieldsToKeep = createHeaders(outResults[lineNum], rename, ofieldsToKeep)
+            end
+
+            lineNum = lineNum + 1
+            outResults[lineNum] = {}
+            fieldNum = 1
+            fieldStart = i + 1
+            if (currentChar == CR and nextChar == LF) then
+                -- CRLF: step over the LF too
+                skipChar = 1
+                fieldStart = fieldStart + 1
+            end
+        end
+
+        i = i + 1 + skipChar
+        if (skipChar > 0) then
+            currentChar = string.byte(inputString, i)
+        else
+            currentChar = nextChar
+        end
+        skipChar = 0
+    end
+
+    -- if the line doesn't end happily (with a quote/newline), the last char will be forgotten.
+    -- this should take care of that.
+    createNewField(inputString, quote, fieldStart, i, outResults[lineNum], headerField[fieldNum], doubleQuoteEscape, fieldsToKeep)
+
+    -- clean up last line if it's weird (this happens when there is a CRLF newline at end of file)
+    -- doing a count gets it to pick up the oddballs
+    local count = 0
+    for _, _ in pairs(outResults[lineNum]) do
+        count = count + 1
+    end
+    if count ~= #headerField then
+        outResults[lineNum] = nil
+    end
+
+    return outResults
+end
+
+--- Format one value as a CSV field, quoting it when required.
+-- @param field     the value to write; non-string values are converted with
+--                  tostring(), nil is written as an empty field
+-- @param separator optional separator string in use (defaults to ","); fields
+--                  containing it are quoted
+-- @return the field text, wrapped in double quotes (with embedded quotes
+--         doubled) when it is empty or contains a quote, space, comma, the
+--         separator, or a CR/LF character
+local function delimitField(field, separator)
+    if field == nil then
+        field = ""
+    elseif type(field) ~= "string" then
+        -- numbers and booleans have no string methods; convert before searching
+        field = tostring(field)
+    end
+    separator = separator or ","
+
+    -- every search is a plain-text find so that a separator such as "." or "%"
+    -- is not interpreted as a Lua pattern
+    if field:find('"', 1, true) then
+        return '"' .. field:gsub('"', '""') .. '"'
+    elseif field:find(" ", 1, true)
+        or field:find(",", 1, true)
+        or field:find(separator, 1, true)
+        or field:find("\n", 1, true)
+        or field:find("\r", 1, true) then
+        return '"' .. field .. '"'
+    elseif field == "" then
+        return '""'
+    else
+        return field
+    end
+end
+
+--- Serialise a list of row tables as CSV text.
+-- @param inputTable list of rows; each row is a table keyed by header name
+-- @param separator  field separator; a string of exactly one character
+-- @param headers    optional list of keys giving the column order; when omitted
+--                   the keys of the first row are used, sorted with table.sort
+-- @return the CSV text: a header line followed by one line per row, joined with CRLF
+-- Raises an error for an invalid separator or when a row lacks a value for one
+-- of the headers.
+function ftcsv.encode(inputTable, separator, headers)
+    -- separator MUST be a string of exactly one character; check the type first
+    -- so a nil or numeric separator gets this message rather than a length error
+    if type(separator) ~= "string" or #separator ~= 1 then
+        error("the separator must be of string type and exactly one character")
+    end
+
+    -- collects the finished lines; joined once at the end
+    local output = {}
+
+    -- grab the headers from the first row if they are not provided
+    if headers == nil then
+        headers = {}
+        -- an empty inputTable has no first row; fall back to no headers
+        for key, _ in pairs(inputTable[1] or {}) do
+            headers[#headers+1] = key
+        end
+
+        -- pairs() has no defined order, so sort for a stable column order
+        table.sort(headers)
+    end
+
+    -- header line; `line` is reused as the buffer for every row below
+    local line = {}
+    for i, header in ipairs(headers) do
+        line[i] = delimitField(header, separator)
+    end
+    local expectedLength = #line
+    output[1] = table.concat(line, separator)
+
+    for i, fields in ipairs(inputTable) do
+        -- count only the fields that are actually present, so that a row
+        -- missing a header's value is detected by the check below
+        local numFields = 0
+        for j = 1, #headers do
+            local field = fields[headers[j]]
+            if field ~= nil then
+                line[j] = delimitField(field, separator)
+                numFields = numFields + 1
+            end
+        end
+        -- all lines should have the same number of fields
+        if numFields ~= expectedLength then
+            error("All lines should have the same length. The line at row " .. i .. " is of length " .. numFields .. " instead of " .. expectedLength)
+        end
+        output[i+1] = table.concat(line, separator)
+    end
+
+    return table.concat(output, "\r\n")
+end
+
+return ftcsv
--- a/spec/csvs/comma_in_quotes.csv
+++ b/spec/csvs/comma_in_quotes.csv
@ -0,0 +1,2 @@
+first,last,address,city,zip
+John,Doe,120 any st.,"Anytown, WW",08123
--- a/spec/csvs/correctness.csv
+++ b/spec/csvs/correctness.csv
@ -0,0 +1,7 @@
+Year,Make,Model,Description,Price
+1997,Ford,E350,"ac, abs, moon",3000.00
+1999,Chevy,"Venture ""Extended Edition""","",4900.00
+1996,Jeep,Grand Cherokee,"MUST SELL!
+air, moon roof, loaded",4799.00
+1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00
+,,"Venture ""Extended Edition""","",4900.00
--- a/spec/csvs/empty.csv
+++ b/spec/csvs/empty.csv
@ -0,0 +1,3 @@
+a,b,c
+1,"",""
+2,3,4
--- a/spec/csvs/empty_crlf.csv
+++ b/spec/csvs/empty_crlf.csv
@ -0,0 +1,3 @@
+a,b,c
+1,"",""
+2,3,4
--- a/spec/csvs/empty_no_newline.csv
+++ b/spec/csvs/empty_no_newline.csv
@ -0,0 +1,2 @@
+a,b,c
+1,"",""
--- a/spec/csvs/empty_no_quotes.csv
+++ b/spec/csvs/empty_no_quotes.csv
@ -0,0 +1,2 @@
+a,b,c
+1,,
--- a/spec/csvs/escaped_quotes.csv
+++ b/spec/csvs/escaped_quotes.csv
@ -0,0 +1,3 @@
+a,b
+1,"ha ""ha"" ha"
+3,4
--- a/spec/csvs/json.csv
+++ b/spec/csvs/json.csv
@ -0,0 +1,2 @@
+key,val
+1,"{""type"": ""Point"", ""coordinates"": [102.0, 0.5]}"
--- a/spec/csvs/json_no_newline.csv
+++ b/spec/csvs/json_no_newline.csv
@ -0,0 +1,2 @@
+key,val
+1,"{""type"": ""Point"", ""coordinates"": [102.0, 0.5]}"
--- a/spec/csvs/newlines.csv
+++ b/spec/csvs/newlines.csv
@ -0,0 +1,5 @@
+a,b,c
+1,2,3
+"Once upon 
+a time",5,6
+7,8,9
--- a/spec/csvs/newlines_crlf.csv
+++ b/spec/csvs/newlines_crlf.csv
@ -0,0 +1,5 @@
+a,b,c
+1,2,3
+"Once upon 
+a time",5,6
+7,8,9
--- a/spec/csvs/quotes_and_newlines.csv
+++ b/spec/csvs/quotes_and_newlines.csv
@ -0,0 +1,5 @@
+a,b
+1,"ha 
+""ha"" 
+ha"
+3,4
--- a/spec/csvs/simple.csv
+++ b/spec/csvs/simple.csv
@ -0,0 +1,2 @@
+a,b,c
+1,2,3
--- a/spec/csvs/simple_crlf.csv
+++ b/spec/csvs/simple_crlf.csv
@ -0,0 +1,2 @@
+a,b,c
+1,2,3
--- a/spec/csvs/utf8.csv
+++ b/spec/csvs/utf8.csv
@ -0,0 +1,3 @@
+a,b,c
+1,2,3
+4,5,ʤ
--- a/spec/json/comma_in_quotes.json
+++ b/spec/json/comma_in_quotes.json
@ -0,0 +1,9 @@
+[
+  {
+    "first": "John",
+    "last": "Doe",
+    "address": "120 any st.",
+    "city": "Anytown, WW",
+    "zip": "08123"
+  }
+]
--- a/spec/json/correctness.json
+++ b/spec/json/correctness.json
@ -0,0 +1,37 @@
+[
+    {
+        "Make": "Ford",
+        "Year": "1997",
+        "Price": "3000.00",
+        "Model": "E350",
+        "Description": "ac, abs, moon"
+    },
+    {
+        "Make": "Chevy",
+        "Year": "1999",
+        "Price": "4900.00",
+        "Model": "Venture \"Extended Edition\"",
+        "Description": ""
+    },
+    {
+        "Make": "Jeep",
+        "Year": "1996",
+        "Price": "4799.00",
+        "Model": "Grand Cherokee",
+        "Description": "MUST SELL!\nair, moon roof, loaded"
+    },
+    {
+        "Make": "Chevy",
+        "Year": "1999",
+        "Price": "5000.00",
+        "Model": "Venture \"Extended Edition, Very Large\"",
+        "Description": ""
+    },
+    {
+        "Make": "",
+        "Year": "",
+        "Price": "4900.00",
+        "Model": "Venture \"Extended Edition\"",
+        "Description": ""
+    }
+]
--- a/spec/json/empty.json
+++ b/spec/json/empty.json
@ -0,0 +1,4 @@
+[
+  { "a": "1", "b": "", "c": "" },
+  { "a": "2", "b": "3", "c": "4" }
+]
--- a/spec/json/empty_crlf.json
+++ b/spec/json/empty_crlf.json
@ -0,0 +1,4 @@
+[
+  { "a": "1", "b": "", "c": "" },
+  { "a": "2", "b": "3", "c": "4" }
+]
--- a/spec/json/empty_no_newline.json
+++ b/spec/json/empty_no_newline.json
@ -0,0 +1,3 @@
+[
+  { "a": "1", "b": "", "c": "" }
+]
--- a/spec/json/empty_no_quotes.json
+++ b/spec/json/empty_no_quotes.json
@ -0,0 +1,3 @@
+[
+  { "a": "1", "b": "", "c": "" }
+]
--- a/spec/json/escaped_quotes.json
+++ b/spec/json/escaped_quotes.json
@ -0,0 +1,10 @@
+[
+  {
+    "a": "1",
+    "b": "ha \"ha\" ha"
+  },
+  {
+    "a": "3",
+    "b": "4"
+  }
+]
--- a/spec/json/json.json
+++ b/spec/json/json.json
@ -0,0 +1,6 @@
+[
+  {
+    "key": "1",
+    "val": "{\"type\": \"Point\", \"coordinates\": [102.0, 0.5]}"
+  }
+]
--- a/spec/json/json_no_newline.json
+++ b/spec/json/json_no_newline.json
@ -0,0 +1,6 @@
+[
+  {
+    "key": "1",
+    "val": "{\"type\": \"Point\", \"coordinates\": [102.0, 0.5]}"
+  }
+]
--- a/spec/json/newlines.json
+++ b/spec/json/newlines.json
@ -0,0 +1,17 @@
+[
+  {
+    "a": "1",
+    "b": "2",
+    "c": "3"
+  },
+  {
+    "a": "Once upon \na time",
+    "b": "5",
+    "c": "6"
+  },
+  {
+    "a": "7",
+    "b": "8",
+    "c": "9"
+  }
+]
--- a/spec/json/newlines_crlf.json
+++ b/spec/json/newlines_crlf.json
@ -0,0 +1,17 @@
+[
+  {
+    "a": "1",
+    "b": "2",
+    "c": "3"
+  },
+  {
+    "a": "Once upon \r\na time",
+    "b": "5",
+    "c": "6"
+  },
+  {
+    "a": "7",
+    "b": "8",
+    "c": "9"
+  }
+]
--- a/spec/json/quotes_and_newlines.json
+++ b/spec/json/quotes_and_newlines.json
@ -0,0 +1,10 @@
+[
+  {
+    "a": "1",
+    "b": "ha \n\"ha\" \nha"
+  },
+  {
+    "a": "3",
+    "b": "4"
+  }
+]
--- a/spec/json/simple.json
+++ b/spec/json/simple.json
@ -0,0 +1,7 @@
+[
+  {
+    "a": "1",
+    "b": "2",
+    "c": "3"
+  }
+]
--- a/spec/json/simple_crlf.json
+++ b/spec/json/simple_crlf.json
@ -0,0 +1,7 @@
+[
+  {
+    "a": "1",
+    "b": "2",
+    "c": "3"
+  }
+]
--- a/spec/json/utf8.json
+++ b/spec/json/utf8.json
@ -0,0 +1,12 @@
+[
+  {
+    "a": "1",
+    "b": "2",
+    "c": "3"
+  },
+  {
+    "a": "4",
+    "b": "5",
+    "c": "ʤ"
+  }
+]
--- a/spec/test_spec.lua
+++ b/spec/test_spec.lua
@ -0,0 +1,67 @@
+local cjson = require("cjson")
+local ftcsv = require('ftcsv')
+-- local csv = require('csv')
+-- local staecsv = require('state-csv')
+
+--- Read a whole file into a string.
+-- @param textFile path of the file to read
+-- @return the complete file contents
+-- Raises an error when the file cannot be opened.
+local function loadFile(textFile)
+    -- binary mode keeps CR/LF bytes exactly as stored, which the *_crlf
+    -- fixtures depend on, even on platforms that translate line endings
+    local file, reason = io.open(textFile, "rb")
+    if not file then error("File not found at " .. textFile .. " (" .. tostring(reason) .. ")") end
+    local allLines = file:read("*a")
+    file:close()
+    return allLines
+end
+
+-- Base names of the fixtures exercised by the specs below: each name is loaded
+-- as spec/csvs/<name>.csv and spec/json/<name>.json.
+local files = {
+	"comma_in_quotes",
+	"correctness",
+	"empty",
+	"empty_no_newline",
+	"empty_no_quotes",
+	"empty_crlf",
+	"escaped_quotes",
+	"json",
+	"json_no_newline",
+	"newlines",
+	"newlines_crlf",
+	"quotes_and_newlines",
+	"simple",
+	"simple_crlf",
+	"utf8"
+}
+
+-- Decode every CSV fixture and compare the result with its JSON counterpart.
+describe("csv decode", function()
+	for _, name in ipairs(files) do
+		it("should handle " .. name, function()
+			local csvText = loadFile("spec/csvs/" .. name .. ".csv")
+			local expected = cjson.decode(loadFile("spec/json/" .. name .. ".json"))
+			local actual = ftcsv.decode(csvText, ",")
+			assert.are.same(expected, actual)
+		end)
+	end
+end)
+
+
+-- Encode each JSON fixture as CSV, decode that CSV again and expect the
+-- original rows back.
+describe("csv encode", function()
+	for _, value in ipairs(files) do
+		it("should handle " .. value, function()
+			-- only the JSON fixture is needed: the CSV under test is produced by
+			-- ftcsv.encode itself
+			local jsonFile = loadFile("spec/json/" .. value .. ".json")
+			local jsonDecode = cjson.decode(jsonFile)
+			local reEncoded = ftcsv.decode(ftcsv.encode(jsonDecode, ","), ",")
+			assert.are.same(jsonDecode, reEncoded)
+		end)
+	end
+end)