first commit of lua-csv

Geoff Leyland 2013-12-04 22:16:11 +13:00
commit da57f60673
7 changed files with 483 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.DS_Store
lua/docs

1
AUTHORS Normal file

@@ -0,0 +1 @@
Leyland, Geoff

21
LICENSE Normal file

@@ -0,0 +1,21 @@
Copyright (c) 2013 Incremental IP Limited

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

83
README.md Normal file

@@ -0,0 +1,83 @@
# Lua-CSV - delimited file reading
## 1. What?
Lua-CSV is a Lua module for reading delimited text files (typically comma-
or tab-separated files, though you can specify the separator).
Lua-CSV tries to auto-detect whether a file is delimited with commas or tabs,
copes with non-native newlines, survives newlines and quotes inside quoted
fields and offers an iterator interface so it can handle large files.
## 2. How?

```lua
local csv = require("csv")
local f = csv.open("file.csv")
for fields in f:lines() do
  for i, v in ipairs(fields) do print(i, v) end
end
```
`csv.open` takes an optional second argument `parameters`: a table
controlling how the file is read:
+ `separator` sets the separator.  It'll probably guess the separator
  correctly if it's a comma or a tab (unless, say, the first field in a
  tab-delimited file contains a comma), but if you want something else you'll
  have to set this.  It could be more than one character, but it's used as
  part of a set: `"["..sep.."\n\r]"`
+ Set `header` to `true` if the file contains a header; each set of fields
  will then be keyed by the names in the header rather than by integer index.
+ `columns` provides a mechanism for column remapping (see the sketch after
  this list).  Suppose you have a CSV file as follows:

  ```
  Word,Number
  ONE,10
  ```

  If `columns` is:

  + `{ word = true }` then the only field in each record would be
    `{ word = "ONE" }`
  + `{ first = { name = "word" } }` then it would be `{ first = "ONE" }`
  + `{ word = { transform = string.lower } }` would give `{ word = "one" }`
  + ```lua
    { word = true,
      number = { transform = function(x) return tonumber(x) / 10 end } }
    ```
    would give `{ word = "ONE", number = 1 }`

  A column can have more than one name:
  `{ first = { names = { "word", "worm" } } }` to help cope with badly
  specified file formats and spelling mistakes.
+ `buffer_size` controls the size of the blocks the file is read in. The
default is 4096, which is what `pagesize` says on my system.
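
Putting the parameters together, here's a minimal sketch (the file
`results.csv` and its `Word` and `Number` columns are assumptions for
illustration):

```lua
local csv = require("csv")

-- results.csv (hypothetical):
--   Word,Number
--   ONE,10
local f = csv.open("results.csv", {
  separator = ",",     -- set explicitly rather than letting it guess
  buffer_size = 4096,  -- the default block size
  columns = {
    word = true,                        -- keep "Word" as it is
    number = { transform = tonumber },  -- convert "Number" to a number
  },
})

for fields in f:lines() do
  print(fields.word, fields.number)  --> ONE   10
end
f:close()
```

Note that when `columns` is given, the first line of the file is used to work
out the column mapping, so there's no need to also set `header`.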
## 3. Requirements
Lua 5.1, 5.2 or LuaJIT.
## 4. Issues
+ It won't cope with multiple delimiter characters between fields, as you
  might see in a whitespace-delimited file; instead it'll think there are
  lots of empty fields (with a space separator, `1  2` parses as three
  fields: `1`, an empty string and `2`).
## 5. Wishlist
+ Tests would be nice.
+ So would better LDoc documentation.
## 6. Alternatives
+ [Penlight](http://github.com/stevedonovan/penlight) contains delimited
  file reading.  It reads the whole file in one go.
+ The Lua Wiki contains two pages on CSV:
  [here](http://lua-users.org/wiki/LuaCsv) and
  [here](http://lua-users.org/wiki/CsvUtils).
+ There's an example using [LPeg](http://www.inf.puc-rio.br/~roberto/lpeg/)
  to parse CSV [here](http://www.inf.puc-rio.br/~roberto/lpeg/#CSV).

4
lua/config.ld Normal file

@@ -0,0 +1,4 @@
project = "Lua-CSV"
title = "Lua-CSV Source Documentation"
description = "Lua-CSV reads delimited text files"
format = "markdown"

348
lua/csv.lua Normal file

@@ -0,0 +1,348 @@
--- Read a comma or tab (or other delimiter) separated file.
-- This version of a CSV reader differs from others I've seen in that it
--
-- + handles embedded newlines in fields (if they're delimited with double
-- quotes)
-- + is line-ending agnostic
-- + reads the file line-by-line, so it can potentially handle large
-- files.
--
-- Of course, for such a simple format, CSV is horribly complicated, so it
-- likely gets something wrong.
-- (c) Copyright 2013 Incremental IP Limited.
-- Available under the MIT licence. See LICENSE for more information.
local DEFAULT_BUFFER_SIZE = 4096 -- matches the default documented in README.md
------------------------------------------------------------------------------
local function trim_space(s)
  return s:match("^%s*(.-)%s*$")
end
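
--- Collapse doubled quotes and strip the trailing quote from a quoted field.
-- The parser hands over the field text with its closing quote still attached,
-- so, for example, fix_quotes('say ""hi"""') returns 'say "hi"'.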
local function fix_quotes(s)
  -- the sub(..., -2) is to strip the trailing quote
  return string.sub(s:gsub('""', '"'), 1, -2)
end
------------------------------------------------------------------------------
--- Parse a list of columns.
-- The main job here is normalising column names and dealing with columns
-- for which we have more than one possible name in the header.
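-- For example, `{ word = true, first = { name = "First_Name" } }` becomes
-- `{ word = { name = "word" }, ["first name"] = { name = "first" } }`:
-- underscores are treated as spaces, and the keys are lowercased.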
local function build_column_name_map(columns)
  local column_name_map = {}
  for n, v in pairs(columns) do
    local names
    local t
    if type(v) == "table" then
      t = { transform = v.transform, default = v.default }
      if v.name then
        names = { (v.name:gsub("_+", " ")) }
      else
        names = v.names
        for i, n in ipairs(names) do names[i] = n:gsub("_+", " ") end
      end
    else
      if type(v) == "function" then
        t = { transform = v }
      else
        t = {}
      end
    end
    if not names then
      names = { (n:lower():gsub("_", " ")) }
    end
    t.name = n
    for _, n in ipairs(names) do
      column_name_map[n:lower()] = t
    end
  end
  return column_name_map
end
--- Map "virtual" columns to file columns.
-- Once we've read the header, work out which columns we're interested in and
-- what to do with them. Mostly this is about checking we've got the columns
-- we need and writing a nice complaint if we haven't.
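-- For example, if the file's header is { "Word", "Number" } and the name map
-- has an entry keyed "word", the result maps column 1 to that entry.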
local function build_column_index_map(header, column_name_map)
  local column_index_map = {}

  -- Match the columns in the file to the columns in the name map
  local found = {}
  for i, word in ipairs(header) do
    word = word:lower():gsub("[^%w%d]+", " "):gsub("^ *(.-) *$", "%1")
    local r = column_name_map[word]
    if r then
      column_index_map[i] = r
      found[r.name] = true
    end
  end

  -- check we found all the columns we need
  local not_found = {}
  for name, r in pairs(column_name_map) do
    if not found[r.name] then
      local nf = not_found[r.name]
      if nf then
        nf[#nf+1] = name
      else
        not_found[r.name] = { name }
      end
    end
  end

  -- If any columns are missing, assemble an error message
  if next(not_found) then
    local problems = {}
    for k, v in pairs(not_found) do
      local missing
      if #v == 1 then
        missing = "'"..v[1].."'"
      else
        missing = "'"..v[1].."'"
        for i = 2, #v - 1 do
          missing = missing..", '"..v[i].."'"
        end
        missing = missing.." or '"..v[#v].."'"
      end
      problems[#problems+1] = "Couldn't find a column named "..missing
    end
    error(table.concat(problems, "\n"), 0)
  end

  return column_index_map
end
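
--- Look up a field's column spec and apply any transform.
-- Returns the (possibly transformed) value and the column's output name, or
-- nothing if this isn't a column we're interested in.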
local function transform_field(value, index, map, filename, line, column)
  local field = map[index]
  if field then
    if field.transform then
      local ok
      ok, value = pcall(field.transform, value)
      if not ok then
        error(("%s:%d:%d: Couldn't read field '%s': %s"):
          format(filename or "<unknown>", line, column,
                 field.name, value))
      end
    end
    return value or field.default, field.name
  end
end
------------------------------------------------------------------------------
--- Iterate through the records in a file
-- Since records might be more than one line (if there's a newline in quotes)
-- and line-endings might not be native, we read the file in chunks of
-- `buffer_size`.
-- For some reason I do this by writing a `find` and a `sub` that treat the
-- buffer as if it were the whole file, with positions relative to `anchor_pos`.
local function separated_values_iterator(file, parameters)
  local buffer_size = parameters.buffer_size or DEFAULT_BUFFER_SIZE
  local filename = parameters.filename or "<unknown>"
  local buffer = ""
  local anchor_pos = 1
  local line, line_start = 1, 1
  local column_name_map = parameters.columns and
    build_column_name_map(parameters.columns)
  local column_index_map

  -- Cut the front off the buffer if we've already read it
  local function truncate()
    if anchor_pos > buffer_size then
      local remove = math.floor(anchor_pos / buffer_size) * buffer_size
      buffer = buffer:sub(remove + 1)
      anchor_pos = anchor_pos - remove
      line_start = line_start - remove
    end
  end

  -- Extend the buffer so we can see more
  local function extend(offset)
    local extra = anchor_pos + offset - 1 - #buffer
    if extra > 0 then
      local size = math.ceil(extra / buffer_size) * buffer_size
      local s = file:read(size)
      if not s then return end
      buffer = buffer..s
    end
  end

  -- Find something in the buffer, extending it if necessary
  local function find(pattern, offset)
    truncate()
    local first, last, capture
    while true do
      first, last, capture = buffer:find(pattern, anchor_pos + offset - 1)
      if not first then
        local s = file:read(buffer_size)
        if not s then return end
        buffer = buffer..s
      else
        return first - anchor_pos + 1, last - anchor_pos + 1, capture
      end
    end
  end

  -- Get a substring from the buffer, extending it if necessary
  local function sub(a, b)
    truncate()
    extend(b)
    return buffer:sub(anchor_pos + a - 1, anchor_pos + b - 1)
  end

  -- If the user hasn't specified a separator, try to work out what it is.
  local sep = parameters.separator
  if not sep then
    local _
    _, _, sep = find("([,\t])", 1)
    -- if we found neither a comma nor a tab, fall back to a comma
    sep = sep or ","
  end
  sep = "(["..sep.."\n\r])"

  -- Start reading the file
  local field_count, fields, starts = 0, {}, {}
  local header

  while true do
    -- stop cleanly if we've consumed the whole file (for example, after a
    -- trailing newline)
    if field_count == 0 and sub(1, 1) == "" then break end

    local field_start_line = line
    local field_start_column = anchor_pos - line_start + 1
    local field_end, sep_end, this_sep
    local tidy

    -- If the field is quoted, go find the other quote
    if sub(1, 1) == '"' then
      anchor_pos = anchor_pos + 1
      local current_pos = 0
      repeat
        local a, b, c = find('"("?)', current_pos + 1)
        current_pos = b
      until c ~= '"'
      if not current_pos then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
      tidy = fix_quotes
      field_end, sep_end, this_sep = find("%s*(%S)", current_pos + 1)
      if this_sep and not this_sep:match(sep) then
        error(("%s:%d:%d: unmatched quote"):
          format(filename, field_start_line, field_start_column))
      end
      -- if the quoted field ends the file, read up to the closing quote
      if not this_sep then field_end = current_pos + 1 end
    else
      field_end, sep_end, this_sep = find(sep, 1)
      -- if there's no separator left, the field runs to the end of the file
      if not field_end then field_end = #buffer - anchor_pos + 2 end
      tidy = trim_space
    end

    -- field_end points at the separator, so the field itself ends one
    -- character earlier
    field_end = field_end - 1

    -- Read the field, then convert all the line endings to \n, and
    -- count any embedded line endings
    local value = sub(1, field_end)
    value = value:gsub("\r\n", "\n"):gsub("\r", "\n")
    for nl in value:gmatch("\n()") do
      line = line + 1
      line_start = nl + anchor_pos
    end
    value = tidy(value)
    field_count = field_count + 1

    -- Insert the value into the table for this "line"
    local key
    if column_index_map then
      value, key = transform_field(value, field_count, column_index_map,
        filename, field_start_line, field_start_column)
    elseif header then
      key = header[field_count]
    else
      key = field_count
    end
    if key then
      fields[key] = value
      starts[key] = { line = field_start_line, column = field_start_column }
    end

    -- if we ended on a newline then yield the fields on this line.
    if not this_sep or this_sep == "\r" or this_sep == "\n" then
      if column_name_map and not column_index_map then
        column_index_map = build_column_index_map(fields, column_name_map)
      elseif parameters.header and not header then
        header = fields
      else
        coroutine.yield(fields, starts)
      end
      field_count, fields, starts = 0, {}, {}
    end

    -- If we *really* didn't find a separator then we're done.
    if not sep_end then break end

    -- If we ended on a newline then count it.
    if this_sep == "\r" or this_sep == "\n" then
      if this_sep == "\r" and sub(sep_end + 1, sep_end + 1) == "\n" then
        sep_end = sep_end + 1
      end
      line = line + 1
      line_start = anchor_pos + sep_end
    end

    anchor_pos = anchor_pos + sep_end
  end
end
------------------------------------------------------------------------------
local file_mt =
{
  lines = function(t)
    return coroutine.wrap(function()
      separated_values_iterator(t.file, t.parameters)
    end)
  end,
  close = function(t)
    t.file:close()
  end,
}
file_mt.__index = file_mt
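
--- Wrap an already-open file handle (io.stdin, for example) as a delimited file.
-- @return a file object like the one returned by `open`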
local function use(file, parameters)
  parameters = parameters or {}
  local f = { file = file, parameters = parameters }
  return setmetatable(f, file_mt)
end
--- Open a file for reading as a delimited file
-- @return a file object
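-- @usage
-- -- a minimal sketch; "data.csv" and its "word" column are assumptions:
-- local csv = require("csv")
-- local f = csv.open("data.csv", { header = true })
-- for fields in f:lines() do
--   print(fields.word)  -- with `header`, keys are the header names as written
-- end
-- f:close()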
local function open(
  filename,    -- string: name of the file to open
  parameters)  -- ?table: parameters controlling reading the file.
               -- See README.md
  local file, message = io.open(filename, "r")
  if not file then return nil, message end

  parameters = parameters or {}
  parameters.filename = filename
  return use(file, parameters)
end
------------------------------------------------------------------------------
return { open = open, use = use }
------------------------------------------------------------------------------

24
csv-1-1.rockspec Normal file

@@ -0,0 +1,24 @@
package = "csv"
version = "1-1"
source =
{
  url = "git://github.com/geoffleyland/lua-csv.git",
  branch = "master",
  tag = "v1",
}
description =
{
  summary = "CSV and other delimited file reading",
  homepage = "http://github.com/geoffleyland/lua-csv",
  license = "MIT/X11",
  maintainer = "Geoff Leyland <geoff.leyland@incremental.co.nz>",
}
dependencies = { "lua >= 5.1" }
build =
{
  type = "builtin",
  modules =
  {
    csv = "lua/csv.lua",
  },
}