2013-12-04 09:16:11 +00:00
|
|
|
--- Read a comma or tab (or other delimiter) separated file.
|
|
|
|
-- This version of a CSV reader differs from others I've seen in that it
|
|
|
|
--
|
|
|
|
-- + handles embedded newlines in fields (if they're delimited with double
|
|
|
|
-- quotes)
|
|
|
|
-- + is line-ending agnostic
|
|
|
|
-- + reads the file line-by-line, so it can potentially handle large
|
|
|
|
-- files.
|
|
|
|
--
|
|
|
|
-- Of course, for such a simple format, CSV is horribly complicated, so it
|
|
|
|
-- likely gets something wrong.
|
|
|
|
|
|
|
|
-- (c) Copyright 2013 Incremental IP Limited.
|
2014-05-18 17:52:16 +00:00
|
|
|
-- (c) Copyright 2014 Kevin Martin
|
2013-12-04 09:16:11 +00:00
|
|
|
-- Available under the MIT licence. See LICENSE for more information.
|
|
|
|
|
2014-05-18 17:52:16 +00:00
|
|
|
-- Number of bytes requested per read when the caller supplies no
-- `buffer_size` parameter.
local DEFAULT_BUFFER_SIZE = 4096
|
2013-12-04 09:16:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
--- Strip leading and trailing whitespace from a string.
-- @param s string: the text to trim
-- @return the text between the first and last non-whitespace characters
--   (an empty string when `s` is empty or all whitespace)
local function trim_space(s)
  -- The lazy `(.-)` leaves all trailing whitespace to the final `%s*`.
  return s:match("^%s*(.-)%s*$")
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Turn the raw body of a quoted field into its value.
-- @param s string: the field text after the opening quote, still carrying
--   its closing quote as the last character
-- @return `s` with every doubled quote collapsed to one and the final
--   character removed
local function fix_quotes(s)
  -- Assigning to a single local keeps only gsub's first result (the string)
  -- and drops the replacement count.
  local unescaped = s:gsub('""', '"')
  -- sub(1, -2) strips the trailing (closing) quote.
  return unescaped:sub(1, -2)
end
|
|
|
|
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
--- Parse a list of columns.
-- The main job here is normalising column names and dealing with columns
-- for which we have more than one possible name in the header.
--
-- @param columns table: maps each output field name to a column spec.
--   A spec may be a table (`transform`, `default`, and either `name` or
--   `names` are read from it), a function (used as the transform), or any
--   other value (no transform, no default).
-- @return table mapping each lower-cased header name to a record
--   `{ name = <output field name>, transform = ..., default = ... }`.
--   All aliases of one column share the same record.
local function build_column_name_map(columns)
  local column_name_map = {}

  for field_name, spec in pairs(columns) do
    local record, names

    if type(spec) == "table" then
      record = { transform = spec.transform, default = spec.default }
      if spec.name then
        -- Parentheses keep only gsub's first result.
        names = { (spec.name:gsub("_+", " ")) }
      elseif spec.names then
        -- Build a fresh list so the caller's `names` table is not modified.
        names = {}
        for i, alias in ipairs(spec.names) do
          names[i] = (alias:gsub("_+", " "))
        end
      end
    elseif type(spec) == "function" then
      record = { transform = spec }
    else
      record = {}
    end

    -- No explicit header name: derive one from the field name. Runs of
    -- underscores collapse to one space, matching the `name`/`names` branches
    -- above, so a name like "a__b" becomes "a b".
    if not names then
      names = { (field_name:lower():gsub("_+", " ")) }
    end

    record.name = field_name
    for _, alias in ipairs(names) do
      column_name_map[alias:lower()] = record
    end
  end

  return column_name_map
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Map "virtual" columns to file columns.
-- Once we've read the header, work out which columns we're interested in and
-- what to do with them. Mostly this is about checking we've got the columns
-- we need and writing a nice complaint if we haven't.
--
-- @param header table: sequence of header cell strings, in file order
-- @param column_name_map table: header name -> column record, as built by
--   `build_column_name_map`
-- @return table mapping a header position to the column record to apply;
--   positions whose header matches no known name are absent
-- Raises a string error (level 0, so no source position is prepended) with
-- one line per column for which none of its names appears in the header.
local function build_column_index_map(header, column_name_map)
  local column_index_map = {}

  -- Match the columns in the file to the columns in the name map
  local found = {}
  for i, word in ipairs(header) do
    -- Lower-case, squash every run of non-alphanumerics to one space, then
    -- strip leading/trailing spaces.
    word = word:lower():gsub("[^%w%d]+", " "):gsub("^ *(.-) *$", "%1")
    local r = column_name_map[word]
    if r then
      column_index_map[i] = r
      found[r.name] = true
    end
  end

  -- check we found all the columns we need
  local not_found = {}
  for name, r in pairs(column_name_map) do
    if not found[r.name] then
      local aliases = not_found[r.name]
      if not aliases then
        aliases = {}
        not_found[r.name] = aliases
      end
      aliases[#aliases + 1] = name
    end
  end

  -- If any columns are missing, assemble an error message
  if next(not_found) then
    -- pairs() order is unspecified, so sort to keep the message stable.
    local missing_columns = {}
    for column in pairs(not_found) do
      missing_columns[#missing_columns + 1] = column
    end
    table.sort(missing_columns, function(a, b)
      return tostring(a) < tostring(b)
    end)

    local problems = {}
    for _, column in ipairs(missing_columns) do
      local aliases = not_found[column]
      table.sort(aliases)
      -- Quote every alternative, including the first one.
      local quoted = {}
      for i, alias in ipairs(aliases) do
        quoted[i] = "'" .. alias .. "'"
      end
      local missing
      if #quoted == 1 then
        missing = quoted[1]
      else
        missing = table.concat(quoted, ", ", 1, #quoted - 1) ..
          " or " .. quoted[#quoted]
      end
      problems[#problems + 1] = "Couldn't find a column named " .. missing
    end
    error(table.concat(problems, "\n"), 0)
  end

  return column_index_map
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Apply a column's transform and default to one raw field value.
-- @param value the raw field value
-- @param index number: position of the field within its record
-- @param map table: position -> column record (`name`, optional `transform`,
--   optional `default`)
-- @param filename ?string: used in error messages ("<unknown>" when nil)
-- @param line number: line where the field starts, for error messages
-- @param column number: column where the field starts, for error messages
-- @return the resulting value and the column's output name; returns nothing
--   when `map` has no entry for `index`
-- Raises an error naming the file position and field when the transform
-- itself raises.
local function transform_field(value, index, map, filename, line, column)
  local field = map[index]
  if not field then return end

  if field.transform then
    local ok, result = pcall(field.transform, value)
    if not ok then
      -- tostring() so a non-string error object cannot break the formatting.
      error(("%s:%d:%d: Couldn't read field '%s': %s"):
        format(filename or "<unknown>", line, column,
               tostring(field.name), tostring(result)))
    end
    value = result
  end

  -- Only a missing (nil) value falls back to the default. A transform that
  -- returns `false` has produced a real value and must keep it.
  if value == nil then
    value = field.default
  end
  return value, field.name
end
|
|
|
|
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
--- Iterate through the records in a file
-- Since records might be more than one line (if there's a newline in quotes)
-- and line-endings might not be native, we read the file in chunks of
-- `buffer_size`.
-- This is done with local `find` and `sub` helpers that work on offsets
-- relative to the start of the current field (`anchor_pos`) and pull more
-- data from the file when the buffer does not reach far enough.
--
-- Must run inside a coroutine: every record is handed back with
-- `coroutine.yield(fields, starts)`, where `fields` maps a key (position,
-- header text or column name) to the value and `starts` maps the same key to
-- `{ line = ..., column = ... }`.
--
-- @param file an object with a `read(self, n)` method returning up to `n`
--   bytes, or nil at end of input
-- @param parameters table: `buffer_size`, `filename`, `columns`, `separator`
--   and `header` are read from it
local function separated_values_iterator(file, parameters)
  local buffer_size = parameters.buffer_size or DEFAULT_BUFFER_SIZE
  local filename = parameters.filename or "<unknown>"
  local buffer = ""
  local anchor_pos = 1
  local line, line_start = 1, 1
  local column_name_map = parameters.columns and
    build_column_name_map(parameters.columns)
  local column_index_map


  -- Cut the front off the buffer if we've already read it
  local function truncate()
    if anchor_pos > buffer_size then
      -- Drop whole chunks that lie strictly before anchor_pos. Using
      -- (anchor_pos - 1) keeps the byte *at* anchor_pos when anchor_pos is an
      -- exact multiple of buffer_size.
      local remove = math.floor((anchor_pos - 1) / buffer_size) * buffer_size
      buffer = buffer:sub(remove + 1)
      anchor_pos = anchor_pos - remove
      line_start = line_start - remove
    end
  end


  -- Extend the buffer so we can see more
  local function extend(offset)
    local extra = anchor_pos + offset - 1 - #buffer
    if extra > 0 then
      -- Read a whole number of chunks.
      local size = math.ceil(extra / buffer_size) * buffer_size
      local s = file:read(size)
      if not s then return end
      buffer = buffer..s
    end
  end


  -- Find something in the buffer, extending it if necessary.
  -- Offsets in and out are relative to anchor_pos; returns nothing when the
  -- input ends before the pattern matches.
  local function find(pattern, offset)
    truncate()
    local first, last, capture
    while true do
      first, last, capture = buffer:find(pattern, anchor_pos + offset - 1)
      if not first then
        local s = file:read(buffer_size)
        if not s then return end
        buffer = buffer..s
      else
        return first - anchor_pos + 1, last - anchor_pos + 1, capture
      end
    end
  end


  -- Get a substring from the buffer, extending it if necessary.
  -- `a` and `b` are relative to anchor_pos; b == -1 means "to the end of
  -- what is buffered".
  local function sub(a, b)
    truncate()
    extend(b)
    local b = b == -1 and b or anchor_pos + b - 1
    return buffer:sub(anchor_pos + a - 1, b)
  end


  -- If the user hasn't specified a separator, try to work out what it is.
  local sep = parameters.separator
  if not sep then
    local _
    _, _, sep = find("([,\t])", 1)
    -- No comma or tab anywhere (single-column or empty input): fall back to
    -- a comma rather than concatenating nil below.
    sep = sep or ","
  end
  -- The separator goes inside a character class, so escape anything that is
  -- not alphanumeric ("^", "]", "%", "-" are magic there).
  sep = "([" .. sep:gsub("%W", "%%%0") .. "\n\r])"


  -- Start reading the file
  local field_count, fields, starts = 0, {}, {}
  local header

  while true do
    local field_start_line = line
    local field_start_column = anchor_pos - line_start + 1
    local field_end, sep_end, this_sep
    local tidy

    -- If the field is quoted, go find the other quote
    if sub(1, 1) == '"' then
      anchor_pos = anchor_pos + 1
      local current_pos = 0
      repeat
        local _, last, doubled = find('"("?)', current_pos + 1)
        if last and doubled == "" and anchor_pos + last - 1 == #buffer then
          -- The quote is the last byte buffered, so `"?` had nothing to look
          -- at. Pull in more input to see whether this is really the first
          -- half of an escaped `""`.
          if sub(last + 1, last + 1) == '"' then
            last, doubled = last + 1, '"'
          end
        end
        current_pos = last
      until doubled ~= '"'
      if not current_pos then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
      tidy = fix_quotes
      -- After the closing quote only spaces may precede the separator.
      field_end, sep_end, this_sep = find(" *([^ ])", current_pos+1)
      if this_sep and not this_sep:match(sep) then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
    else
      field_end, sep_end, this_sep = find(sep, 1)
      tidy = trim_space
    end

    -- Look for the separator or a newline or the end of the file
    -- (no match gives -1, which `sub` treats as "to the end of the buffer").
    field_end = (field_end or 0) - 1

    -- Read the field, then convert all the line endings to \n, and
    -- count any embedded line endings
    local value = sub(1, field_end)
    value = value:gsub("\r\n", "\n"):gsub("\r", "\n")
    for nl in value:gmatch("\n()") do
      line = line + 1
      line_start = nl + anchor_pos
    end

    value = tidy(value)
    field_count = field_count + 1

    -- Insert the value into the table for this "line"
    local key
    if column_index_map then
      value, key = transform_field(value, field_count, column_index_map,
        filename, field_start_line, field_start_column)
    elseif header then
      key = header[field_count]
    else
      key = field_count
    end
    if key then
      fields[key] = value
      starts[key] = { line=field_start_line, column=field_start_column }
    end

    -- if we ended on a newline then yield the fields on this line.
    if not this_sep or this_sep == "\r" or this_sep == "\n" then
      if column_name_map and not column_index_map then
        -- First record with `columns` set: it is the header row.
        column_index_map = build_column_index_map(fields, column_name_map)
      elseif parameters.header and not header then
        header = fields
      else
        local _, v = next(fields)
        if v ~= "" or field_count > 1 then -- ignore blank lines
          coroutine.yield(fields, starts)
        end
      end
      field_count, fields, starts = 0, {}, {}
    end

    -- If we *really* didn't find a separator then we're done.
    if not sep_end then break end

    -- If we ended on a newline then count it.
    if this_sep == "\r" or this_sep == "\n" then
      -- Treat "\r\n" as a single line ending.
      if this_sep == "\r" and sub(sep_end+1, sep_end+1) == "\n" then
        sep_end = sep_end + 1
      end
      line = line + 1
      line_start = anchor_pos + sep_end
    end

    anchor_pos = anchor_pos + sep_end
  end
end
|
|
|
|
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
-- Methods shared by every reader object (a table with `file` and
-- `parameters` fields, as created by `use`).
local file_mt =
{
  --- Return an iterator function over the records of the file.
  -- Each call resumes a coroutine running `separated_values_iterator` and
  -- returns whatever it yields; it returns nothing once the input ends.
  lines = function(t)
    return coroutine.wrap(function()
      separated_values_iterator(t.file, t.parameters)
    end)
  end,

  --- Close the underlying file handle.
  -- @return whatever the handle's own `close` returns
  close = function(t)
    return t.file:close()
  end,

  --- Return the name recorded in the reader's parameters.
  name = function(t)
    return t.parameters.filename
  end,
}
-- Method lookups on reader objects resolve through this table.
file_mt.__index = file_mt
|
|
|
|
|
|
|
|
|
|
|
|
--- Wrap an already-open file-like object as a reader.
-- @param file an object providing `read` (and `close`, if `close` will be
--   called on the reader)
-- @param parameters ?table: parameters controlling reading; an empty table
--   is substituted when omitted so the reader methods never index nil
-- @return a reader object with `lines`, `close` and `name` methods
local function use(file, parameters)
  local f = { file = file, parameters = parameters or {} }
  return setmetatable(f, file_mt)
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Open a file for reading as a delimited file
-- The file name is stored in `parameters.filename` (the caller's table is
-- updated when one is passed).
-- @return a file object, or nil plus the message from `io.open` when the
--   file cannot be opened
local function open(
  filename,           -- string: name of the file to open
  parameters)         -- ?table: parameters controlling reading the file.
                      -- See README.md
  local file, message = io.open(filename, "r")
  if not file then return nil, message end

  parameters = parameters or {}
  parameters.filename = filename
  return use(file, parameters)
end
|
|
|
|
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
2014-05-18 17:52:16 +00:00
|
|
|
-- Minimal file-handle lookalike over an in-memory string: just the `read`
-- and `close` methods the reader calls.
local stringfh_mt =
{
  --- Return the next `bytes` bytes of the string, or nil once it is used up.
  -- A cursor (`_pos`) is advanced instead of re-slicing the remaining text
  -- on every call, so each read only copies the chunk it returns.
  read = function(self, bytes)
    local text = self._string
    if not text then return nil end

    local from = self._pos or 1
    local chunk = text:sub(from, from + bytes - 1)
    from = from + bytes

    if from > #text then
      -- Everything has been handed out; later reads report end of input.
      self._string, self._pos = nil, nil
    else
      self._pos = from
    end

    return chunk
  end,

  -- Nothing to release for a string.
  close = function()
  end
}
stringfh_mt.__index = stringfh_mt
|
|
|
|
|
|
|
|
--- Open a string for reading as a delimited file
-- When `parameters.filename` is not given, a display name is built from the
-- first line of the contents. When `parameters.buffer_size` is not given,
-- the whole string is read in one chunk. Both values are stored in
-- `parameters` (the caller's table is updated when one is passed).
-- @return a file object
local function openstring(
  filecontents,       -- string: The contents of the delimited file
  parameters)         -- ?table: parameters controlling reading the file.
                      -- See README.md

  parameters = parameters or {}

  -- Build a name such as "<(String) first,line,of,t... >" for messages.
  local function makename()
    local first_line = filecontents:match("[^\n]+") or ""
    local preview = first_line:sub(1, 15)
    -- Only mark the preview as cut when something was actually cut.
    if #first_line > 15 then preview = preview .. "..." end
    return "<(String) " .. preview .. " >"
  end

  parameters.filename = parameters.filename or makename()
  -- Never let the chunk size be 0 (empty contents): the reader divides by it.
  parameters.buffer_size = parameters.buffer_size or
    math.max(#filecontents, 1)
  local fileh = setmetatable({_string = filecontents}, stringfh_mt)
  return use(fileh, parameters)
end
|
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
-- Module interface: `open` reads a named file, `openstring` reads from a
-- string, and `use` wraps an already-open file-like object.
return { open = open, openstring = openstring, use = use }
|
2013-12-04 09:16:11 +00:00
|
|
|
|
|
|
|
------------------------------------------------------------------------------
|