--- Read a comma or tab (or other delimiter) separated file.
--  This version of a CSV reader differs from others I've seen in that it
--
--  + handles embedded newlines in fields (if they're delimited with double
--    quotes)
--  + is line-ending agnostic
--  + reads the file incrementally rather than all at once, so it can
--    potentially handle large files.
--
--  Of course, for such a simple format, CSV is horribly complicated, so it
--  likely gets something wrong.

--  (c) Copyright 2013 Incremental IP Limited.
--  (c) Copyright 2014 Kevin Martin
--  Available under the MIT licence.  See LICENSE for more information.

local DEFAULT_BUFFER_SIZE = 4096


------------------------------------------------------------------------------

--- Strip leading and trailing whitespace from a string.
--  @param s string: the text to trim
--  @return the trimmed string
local function trim_space(s)
  return s:match("^%s*(.-)%s*$")
end


--- Undo CSV quoting on the tail of a quoted field.
--  Collapses every doubled quote into a single quote and then drops the last
--  character, which is expected to be the field's closing quote.
--  @param s string: the field text, without its opening quote but including
--    its closing quote
--  @return the unquoted field text
local function fix_quotes(s)
  -- gsub is not the last argument, so only its first result (the string) is
  -- passed on; the sub(..., -2) is to strip the trailing quote
  return string.sub(s:gsub('""', '"'), 1, -2)
end


------------------------------------------------------------------------------

--- Parse a list of columns.
--  The main job here is normalising column names and dealing with columns
--  for which we have more than one possible name in the header.
--
--  Each entry of `columns` is keyed by the name the caller wants the field
--  stored under, and its value is one of:
--
--  + a table, whose `transform` and `default` are copied and whose `name`
--    (string) or `names` (list of strings) give the header names to look for
--  + a function, which is used as the transform
--  + anything else (e.g. `true`), meaning "just read this column"
--
--  When no explicit name is given the key itself is used.  In every case runs
--  of underscores become a single space and the name is lower-cased.
--  @param columns table: the column specification described above
--  @return a table mapping each normalised header name to a record
--    `{ name = key, transform = ..., default = ... }`; alternative names for
--    the same column share one record
local function build_column_name_map(columns)
  local column_name_map = {}

  for key, spec in pairs(columns) do
    local entry = {}
    local names

    if type(spec) == "table" then
      entry.transform = spec.transform
      entry.default = spec.default
      if spec.name then
        names = { (spec.name:gsub("_+", " ")) }
      elseif spec.names then
        -- Build a fresh list so the caller's `names` table is left untouched
        names = {}
        for i, alias in ipairs(spec.names) do
          names[i] = (alias:gsub("_+", " "))
        end
      end
    elseif type(spec) == "function" then
      entry.transform = spec
    end

    if not names then
      -- Collapse runs of underscores, exactly as for explicit names, so that
      -- a key like `first__name` yields "first name" rather than a name with
      -- a double space
      names = { (key:lower():gsub("_+", " ")) }
    end

    entry.name = key
    for _, alias in ipairs(names) do
      column_name_map[alias:lower()] = entry
    end
  end

  return column_name_map
end


--- Map "virtual" columns to file columns.
--  Once we've read the header, work out which columns we're interested in and
--  what to do with them.  Mostly this is about checking we've got the columns
--  we need and writing a nice complaint if we haven't.
local function build_column_index_map(header, column_name_map)
  -- @param header table: list of the header fields read from the file
  -- @param column_name_map table: normalised header name -> column record,
  --   where each record carries a `name` field
  -- @return a table mapping a file column index to its column record
  -- Raises an error (without position information) that lists every wanted
  -- column for which none of the possible names appeared in the header.
  local column_index_map = {}

  -- Match the columns in the file to the columns in the name map
  local found = {}
  for i, word in ipairs(header) do
    -- Lower-case, turn each run of non-alphanumerics into one space, and
    -- trim, so that e.g. " First-Name " is looked up as "first name"
    word = word:lower():gsub("[^%w%d]+", " "):gsub("^ *(.-) *$", "%1")
    local r = column_name_map[word]
    if r then
      column_index_map[i] = r
      found[r.name] = true
    end
  end

  -- check we found all the columns we need
  local not_found = {}
  for name, r in pairs(column_name_map) do
    if not found[r.name] then
      local nf = not_found[r.name]
      if nf then
        nf[#nf+1] = name
      else
        not_found[r.name] = { name }
      end
    end
  end

  -- If any columns are missing, assemble an error message
  if next(not_found) then
    local problems = {}
    for _, alternatives in pairs(not_found) do
      -- pairs() visits names in no particular order; sort so the message is
      -- the same from run to run
      table.sort(alternatives)
      local quoted = {}
      for i, name in ipairs(alternatives) do
        quoted[i] = "'"..name.."'"
      end
      -- "'a'", "'a' or 'b'", "'a', 'b' or 'c'": every name is quoted,
      -- including the first
      local missing = quoted[#quoted]
      if #quoted > 1 then
        missing = table.concat(quoted, ", ", 1, #quoted - 1).." or "..missing
      end
      problems[#problems+1] = "Couldn't find a column named "..missing
    end
    table.sort(problems)
    error(table.concat(problems, "\n"), 0)
  end

  return column_index_map
end


--- Apply a column's transform and default to one field.
--  @param value string: the text read from the file
--  @param index number: the index of the field within its record
--  @param map table: file column index -> column record
--  @param filename ?string: file name used in error messages
--  @param line number: line the field starts on, for error messages
--  @param column number: column the field starts at, for error messages
--  @return the (possibly transformed or defaulted) value and the name to
--    store it under; nothing at all if the column is not in `map`
--  Raises an error prefixed with `filename:line:column` if the column's
--  transform fails.
local function transform_field(value, index, map, filename, line, column)
  local field = map[index]
  if not field then return end

  if field.transform then
    local ok
    ok, value = pcall(field.transform, value)
    if not ok then
      -- tostring() because the transform may have raised a non-string value
      error(("%s:%d:%d: Couldn't read field '%s': %s"):
        format(filename or "", line, column, field.name, tostring(value)))
    end
  end

  -- Only a missing (nil) value falls back to the default; a transform that
  -- returns false has produced a real value and it is kept
  if value == nil then value = field.default end
  return value, field.name
end


------------------------------------------------------------------------------

--- Iterate through the records in a file
--  Since records might be more than one line (if there's a newline in quotes)
--  and line-endings might not be native, we read the file in chunks of
--  `buffer_size`.
--  For some reason I do this by writing a `find` and `sub` that work on the
--  buffer and read more of the file into it whenever they need to look
--  further ahead.
--
--  Runs inside a coroutine: yields `fields, starts` once per record, where
--  `fields` maps column key to value and `starts` maps column key to
--  `{ line = ..., column = ... }`.
--  @param file an object with a `read(self, bytes)` method that returns nil
--    when there is nothing left
--  @param parameters table: `buffer_size`, `filename`, `columns`, `separator`
--    and `header` are read from it
local function separated_values_iterator(file, parameters)
  local buffer_size = parameters.buffer_size or DEFAULT_BUFFER_SIZE
  local filename = parameters.filename or ""
  local buffer = ""
  -- anchor_pos is the buffer index of the start of the field being read;
  -- the offsets taken and returned by `find` and `sub` are relative to it
  local anchor_pos = 1
  local line, line_start = 1, 1
  local column_name_map = parameters.columns and
    build_column_name_map(parameters.columns)
  local column_index_map


  -- Cut the front off the buffer if we've already read it
  local function truncate()
    if anchor_pos > buffer_size then
      -- Remove whole chunks that lie strictly before the anchor.  Using
      -- (anchor_pos - 1) keeps the anchor byte itself when anchor_pos is an
      -- exact multiple of buffer_size.
      local remove = math.floor((anchor_pos - 1) / buffer_size) * buffer_size
      buffer = buffer:sub(remove + 1)
      anchor_pos = anchor_pos - remove
      line_start = line_start - remove
    end
  end

  -- Extend the buffer so we can see more
  local function extend(offset)
    local extra = anchor_pos + offset - 1 - #buffer
    if extra > 0 then
      -- read a whole number of chunks
      local size = math.ceil(extra / buffer_size) * buffer_size
      local s = file:read(size)
      if not s then return end
      buffer = buffer..s
    end
  end

  -- Find something in the buffer, extending it if necessary.
  -- Returns the start and end of the match relative to the anchor plus the
  -- first capture, or nothing if the file runs out before a match is found.
  local function find(pattern, offset)
    truncate()
    local first, last, capture
    while true do
      first, last, capture = buffer:find(pattern, anchor_pos + offset - 1)
      if not first then
        local s = file:read(buffer_size)
        if not s then return end
        buffer = buffer..s
      else
        return first - anchor_pos + 1, last - anchor_pos + 1, capture
      end
    end
  end

  -- Get a substring from the buffer, extending it if necessary.
  -- An end of -1 means "to the end of the buffer".
  local function sub(a, b)
    truncate()
    extend(b)
    local last = b == -1 and b or anchor_pos + b - 1
    return buffer:sub(anchor_pos + a - 1, last)
  end


  -- If the user hasn't specified a separator, try to work out what it is.
  local sep = parameters.separator
  if not sep then
    local _
    _, _, sep = find("([,\t])", 1)
    -- Neither a comma nor a tab anywhere (single-column or empty input):
    -- fall back to a comma instead of failing on a nil separator
    sep = sep or ","
  end
  if #sep == 1 then
    -- Escape a lone punctuation character so that separators such as "^" or
    -- "%" are taken literally inside the pattern set built below
    sep = sep:gsub("%W", "%%%0")
  end
  sep = "(["..sep.."\n\r])"

  -- Start reading the file
  local field_count, fields, starts = 0, {}, {}
  local header

  while true do
    local field_start_line = line
    local field_start_column = anchor_pos - line_start + 1
    local field_end, sep_end, this_sep
    local tidy

    -- If the field is quoted, go find the other quote
    if sub(1, 1) == '"' then
      anchor_pos = anchor_pos + 1
      local current_pos = 0
      repeat
        local _, quote_end, escaped = find('"("?)', current_pos + 1)
        if escaped == "" and anchor_pos + quote_end - 1 == #buffer then
          -- The quote is the last byte we have, so it might be the first
          -- half of an escaped "" whose second half hasn't been read yet.
          -- Pull in more of the file and look again.
          extend(quote_end + 1)
          _, quote_end, escaped = find('"("?)', current_pos + 1)
        end
        current_pos = quote_end
      until escaped ~= '"'
      if not current_pos then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
      tidy = fix_quotes
      -- After the closing quote only spaces may come before the separator
      field_end, sep_end, this_sep = find(" *([^ ])", current_pos+1)
      if this_sep and not this_sep:match(sep) then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
    else
      field_end, sep_end, this_sep = find(sep, 1)
      tidy = trim_space
    end

    -- Look for the separator or a newline or the end of the file
    -- (no match leaves field_end at -1, i.e. "the rest of the buffer")
    field_end = (field_end or 0) - 1

    -- Read the field, then convert all the line endings to \n, and
    -- count any embedded line endings
    -- NOTE(review): line_start is derived from positions in the normalised
    -- value, so columns reported after an embedded line ending may be
    -- approximate - confirm before relying on them.
    local value = sub(1, field_end)
    value = value:gsub("\r\n", "\n"):gsub("\r", "\n")
    for nl in value:gmatch("\n()") do
      line = line + 1
      line_start = nl + anchor_pos
    end

    value = tidy(value)
    field_count = field_count + 1

    -- Insert the value into the table for this "line"
    local key
    if column_index_map then
      value, key = transform_field(value, field_count, column_index_map,
        filename, field_start_line, field_start_column)
    elseif header then
      key = header[field_count]
    else
      key = field_count
    end
    if key then
      fields[key] = value
      starts[key] = { line=field_start_line, column=field_start_column }
    end

    -- if we ended on a newline then yield the fields on this line.
    if not this_sep or this_sep == "\r" or this_sep == "\n" then
      if column_name_map and not column_index_map then
        -- the first record of a file read with `columns` is its header
        column_index_map = build_column_index_map(fields, column_name_map)
      elseif parameters.header and not header then
        header = fields
      else
        local _, v = next(fields)
        if v ~= "" or field_count > 1 then -- ignore blank lines
          coroutine.yield(fields, starts)
        end
      end
      field_count, fields, starts = 0, {}, {}
    end

    -- If we *really* didn't find a separator then we're done.
    if not sep_end then break end

    -- If we ended on a newline then count it.
    if this_sep == "\r" or this_sep == "\n" then
      -- treat \r\n as a single line ending
      if this_sep == "\r" and sub(sep_end+1, sep_end+1) == "\n" then
        sep_end = sep_end + 1
      end
      line = line + 1
      line_start = anchor_pos + sep_end
    end

    anchor_pos = anchor_pos + sep_end
  end
end


------------------------------------------------------------------------------

-- Methods shared by the objects returned from `use`, `open` and `openstring`
local file_mt =
{
  -- Returns an iterator function that produces `fields, starts` per record
  lines = function(t)
      return coroutine.wrap(function()
          separated_values_iterator(t.file, t.parameters)
        end)
    end,
  -- Closes the underlying file object
  close = function(t)
      t.file:close()
    end,
  -- Returns the file name stored in the parameters
  name = function(t)
      return t.parameters.filename
    end,
}
file_mt.__index = file_mt


--- Wrap an already-open file-like object as a delimited file
--  @param file an object providing `read(self, bytes)` and `close(self)`
--  @param parameters ?table: parameters controlling reading the file.
--  @return a file object
local function use(file, parameters)
  -- the iterator indexes the parameters, so never leave them nil
  local f = { file = file, parameters = parameters or {} }
  return setmetatable(f, file_mt)
end


--- Open a file for reading as a delimited file
--  Sets `parameters.filename` to `filename`.
--  @return a file object, or nil and an error message if the file can't be
--    opened
local function open(
  filename,          -- string: name of the file to open
  parameters)        -- ?table: parameters controlling reading the file.
                     -- See README.md
  local file, message = io.open(filename, "r")
  if not file then return nil, message end

  parameters = parameters or {}
  parameters.filename = filename
  return use(file, parameters)
end


------------------------------------------------------------------------------

-- A minimal file-like object that reads from a string held in `_string`
local stringfh_mt =
{
  -- Returns up to `bytes` characters from the front of the string and keeps
  -- the rest; returns nil once the string has been used up
  read = function(self, bytes)
    if not self._string then return nil end

    local read_rv
    read_rv, self._string =
      self._string:sub(1, bytes), self._string:sub(bytes+1)
    if #self._string == 0 then
      self._string = nil
    end
    return read_rv
  end,
  close = function() end
}
stringfh_mt.__index = stringfh_mt


--- Open a string for reading as a delimited file
--  Fills in `parameters.filename` (a name built from the start of the
--  string) and `parameters.buffer_size` (the length of the string) if they
--  aren't already set.
--  @return a file object
local function openstring(
  filecontents,      -- string: The contents of the delimited file
  parameters)        -- ?table: parameters controlling reading the file.
                     -- See README.md
  parameters = parameters or {}

  -- Build a display name from the first non-empty line of the string
  local function makename()
    local t = {}
    t[#t+1] = "<(String) "
    t[#t+1] = (filecontents:gmatch("[^\n]+")() or ""):sub(1,15)
    if #t[#t] > 14 then t[#t+1] = "..." end
    t[#t+1] = " >"
    return table.concat(t)
  end

  parameters.filename = parameters.filename or makename()
  if parameters.buffer_size == nil then
    -- Never use a zero buffer size: the iterator divides by it, so an empty
    -- string would otherwise break instead of simply producing no records
    parameters.buffer_size = math.max(#filecontents, 1)
  end
  local fileh = setmetatable({_string = filecontents}, stringfh_mt)
  return use(fileh, parameters)
end


------------------------------------------------------------------------------

return { open = open, openstring = openstring, use = use }

------------------------------------------------------------------------------