mirror of
https://github.com/geoffleyland/lua-csv.git
synced 2025-01-13 21:54:19 +00:00
first commit of lua-csv
This commit is contained in:
commit
da57f60673
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.DS_Store
|
||||
lua/docs
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
Copyright (c) 2013 Incremental IP Limited
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
|
||||
|
83
README.md
Normal file
83
README.md
Normal file
@ -0,0 +1,83 @@
|
||||
# Lua-CSV - delimited file reading
|
||||
|
||||
## 1. What?
|
||||
|
||||
Lua-CSV is a Lua module for reading delimited text files (popularly CSV and
|
||||
tab-separated files, but you can specify the separator).
|
||||
|
||||
Lua-CSV tries to auto-detect whether a file is delimited with commas or tabs,
|
||||
copes with non-native newlines, survives newlines and quotes inside quoted
|
||||
fields and offers an iterator interface so it can handle large files.
|
||||
|
||||
|
||||
## 2. How?
|
||||
|
||||
local csv = require("csv")
|
||||
local f = csv.open("file.csv")
|
||||
for fields in f:lines() do
|
||||
for i, v in ipairs(fields) do print(i, v) end
|
||||
end
|
||||
|
||||
`csv.open` takes a second argument `parameters`, a table of parameters
|
||||
controlling how the file is read:
|
||||
|
||||
+ `separator` sets the separator. It'll probably guess the separator
|
||||
correctly if it's a comma or a tab (unless, say, the first field in a
|
||||
tab-delimited file contains a comma), but if you want something else you'll
|
||||
have to set this. It could be more than one character, but it's used as
|
||||
part of a set: `"["..sep.."\n\r]"`
|
||||
|
||||
+ Set `header` to true if the file contains a header and each set of fields
|
||||
will be keyed by the names in the header rather than by integer index.
|
||||
|
||||
+ `columns` provides a mechanism for column remapping.
|
||||
Suppose you have a csv file as follows:
|
||||
|
||||
Word,Number
|
||||
ONE,10
|
||||
|
||||
And columns is:
|
||||
|
||||
+ `{ word = true }` then the only field in the file would be
|
||||
`{ word = "ONE" }`
|
||||
+ `{ first = { name = "word"} }` then it would be { first = "ONE" }
|
||||
+ `{ word = { transform = string.lower }}` would give { word = "one" }
|
||||
+
|
||||
{ word = true
|
||||
number = { transform = function(x) return tonumber(x) / 10 end }}
|
||||
would give `{ word = "ONE", number = 1 }`
|
||||
|
||||
A column can have more than one name:
|
||||
`{ first = { names = {"word", "worm"}}}` to help cope with badly specified
|
||||
file formats and spelling mistakes.
|
||||
|
||||
+ `buffer_size` controls the size of the blocks the file is read in. The
|
||||
default is 4096, which is what `pagesize` says on my system.
|
||||
|
||||
|
||||
## 3. Requirements
|
||||
|
||||
Lua 5.1 or 5.1 or LuaJIT.
|
||||
|
||||
|
||||
## 4. Issues
|
||||
|
||||
+ It won't cope with multiple delimiter characters between fields as might be
|
||||
seen in a whitespace delimited file. Instead it'll think there's lots of
|
||||
empty fields.
|
||||
|
||||
## 5. Wishlist
|
||||
|
||||
+ Tests would be nice.
|
||||
+ So would better LDoc documentation.
|
||||
|
||||
|
||||
## 6. Alternatives
|
||||
|
||||
+ [Penlight](http://github.com/stevedonovan/penlight) contains delimited
|
||||
file reading. It reads the whole file in one go.
|
||||
+ The Lua Wiki contains two pages on CSV
|
||||
[here](http://lua-users.org/wiki/LuaCsv) and
|
||||
[here](http://lua-users.org/wiki/CsvUtils).
|
||||
+ There's an example using [LPeg](http://www.inf.puc-rio.br/~roberto/lpeg/)
|
||||
to parse CSV [here](http://www.inf.puc-rio.br/~roberto/lpeg/#CSV)
|
4
lua/config.ld
Normal file
4
lua/config.ld
Normal file
@ -0,0 +1,4 @@
|
||||
project = "Lua-CSV"
|
||||
title = "Lua-CSV Source Documentation"
|
||||
description = "Lua-CSV reads delimited text files"
|
||||
format = "markdown"
|
348
lua/csv.lua
Normal file
348
lua/csv.lua
Normal file
@ -0,0 +1,348 @@
|
||||
--- Read a comma or tab (or other delimiter) separated file.
|
||||
-- This version of a CSV reader differs from others I've seen in that it
|
||||
--
|
||||
-- + handles embedded newlines in fields (if they're delimited with double
|
||||
-- quotes)
|
||||
-- + is line-ending agnostic
|
||||
-- + reads the file line-by-line, so it can potientially handle large
|
||||
-- files.
|
||||
--
|
||||
-- Of course, for such a simple format, CSV is horribly complicated, so it
|
||||
-- likely gets something wrong.
|
||||
|
||||
-- (c) Copyright 2013 Incremental IP Limited.
|
||||
-- Available under the MIT licence. See LICENSE for more information.
|
||||
|
||||
local DEFAULT_BUFFER_SIZE = 1024
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
local function trim_space(s)
|
||||
return s:match("^%s*(.-)%s*$")
|
||||
end
|
||||
|
||||
|
||||
local function fix_quotes(s)
|
||||
-- the sub(..., -2) is to strip the trailing quote
|
||||
return string.sub(s:gsub('""', '"'), 1, -2)
|
||||
end
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
--- Parse a list of columns.
|
||||
-- The main job here is normalising column names and dealing with columns
|
||||
-- for which we have more than one possible name in the header.
|
||||
local function build_column_name_map(columns)
|
||||
local column_name_map = {}
|
||||
for n, v in pairs(columns) do
|
||||
local names
|
||||
local t
|
||||
if type(v) == "table" then
|
||||
t = { transform = v.transform, default = v.default }
|
||||
if v.name then
|
||||
names = { (v.name:gsub("_+", " ")) }
|
||||
else
|
||||
names = v.names
|
||||
for i, n in ipairs(names) do names[i] = n:gsub("_+", " ") end
|
||||
end
|
||||
else
|
||||
if type(v) == "function" then
|
||||
t = { transform = v }
|
||||
else
|
||||
t = {}
|
||||
end
|
||||
end
|
||||
|
||||
if not names then
|
||||
names = { (n:lower():gsub("_", " ")) }
|
||||
end
|
||||
|
||||
t.name = n
|
||||
for _, n in ipairs(names) do
|
||||
column_name_map[n:lower()] = t
|
||||
end
|
||||
end
|
||||
|
||||
return column_name_map
|
||||
end
|
||||
|
||||
|
||||
--- Map "virtual" columns to file columns.
|
||||
-- Once we've read the header, work out which columns we're interested in and
|
||||
-- what to do with them. Mostly this is about checking we've got the columns
|
||||
-- we need and writing a nice complaint if we haven't.
|
||||
local function build_column_index_map(header, column_name_map)
|
||||
column_index_map = {}
|
||||
|
||||
-- Match the columns in the file to the columns in the name map
|
||||
local found = {}
|
||||
for i, word in ipairs(header) do
|
||||
word = word:lower():gsub("[^%w%d]+", " "):gsub("^ *(.-) *$", "%1")
|
||||
local r = column_name_map[word]
|
||||
if r then
|
||||
column_index_map[i] = r
|
||||
found[r.name] = true
|
||||
end
|
||||
end
|
||||
|
||||
-- check we found all the columns we need
|
||||
local not_found = {}
|
||||
for name, r in pairs(column_name_map) do
|
||||
if not found[r.name] then
|
||||
local nf = not_found[r.name]
|
||||
if nf then
|
||||
nf[#nf+1] = name
|
||||
else
|
||||
not_found[r.name] = { name }
|
||||
end
|
||||
end
|
||||
end
|
||||
-- If any columns are missing, assemble an error message
|
||||
if next(not_found) then
|
||||
local problems = {}
|
||||
for k, v in pairs(not_found) do
|
||||
local missing
|
||||
if #v == 1 then
|
||||
missing = "'"..v[1].."'"
|
||||
else
|
||||
missing = v[1]
|
||||
for i = 2, #v - 1 do
|
||||
missing = missing..", '"..v[i].."'"
|
||||
end
|
||||
missing = missing.." or '"..v[#v].."'"
|
||||
end
|
||||
problems[#problems+1] = "Couldn't find a column named "..missing
|
||||
end
|
||||
error(table.concat(problems, "\n"), 0)
|
||||
end
|
||||
|
||||
return column_index_map
|
||||
end
|
||||
|
||||
|
||||
local function transform_field(value, index, map, filename, line, column)
|
||||
local field = map[index]
|
||||
if field then
|
||||
if field.transform then
|
||||
ok, value = pcall(field.transform, value)
|
||||
if not ok then
|
||||
error(("%s:%d:%d: Couldn't read field '%s': %s"):
|
||||
format(filename or "<unknown>", line, column,
|
||||
field.name, value))
|
||||
end
|
||||
end
|
||||
return value or field.default, field.name
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
--- Iterate through the records in a file
|
||||
-- Since records might be more than one line (if there's a newline in quotes)
|
||||
-- and line-endings might not be native, we read the file in chunks of
|
||||
-- `buffer_size`.
|
||||
-- For some reason I do this by writing a `find` and `sub` tha
|
||||
local function separated_values_iterator(file, parameters)
|
||||
local buffer_size = parameters.buffer_size or DEFAULT_BUFFER_SIZE
|
||||
local filename = parameters.filename or "<unknown>"
|
||||
local buffer = ""
|
||||
local anchor_pos = 1
|
||||
local line, line_start = 1, 1, 1
|
||||
local column_name_map = parameters.columns and
|
||||
build_column_name_map(parameters.columns)
|
||||
local column_index_map
|
||||
|
||||
|
||||
-- Cut the front off the buffer if we've already read it
|
||||
local function truncate()
|
||||
if anchor_pos > buffer_size then
|
||||
local remove = math.floor(anchor_pos / buffer_size) * buffer_size
|
||||
buffer = buffer:sub(remove + 1)
|
||||
anchor_pos = anchor_pos - remove
|
||||
line_start = line_start - remove
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
-- Extend the buffer so we can see more
|
||||
local function extend(offset)
|
||||
local extra = anchor_pos + offset - 1 - #buffer
|
||||
if extra > 0 then
|
||||
local size = math.ceil(extra / buffer_size) * buffer_size
|
||||
local s = file:read(buffer_size)
|
||||
if not s then return end
|
||||
buffer = buffer..s
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
-- Find something in the buffer, extending it if necessary
|
||||
local function find(pattern, offset)
|
||||
truncate()
|
||||
local first, last, capture
|
||||
while true do
|
||||
first, last, capture = buffer:find(pattern, anchor_pos + offset - 1)
|
||||
if not first then
|
||||
local s = file:read(buffer_size)
|
||||
if not s then return end
|
||||
buffer = buffer..s
|
||||
else
|
||||
return first - anchor_pos + 1, last - anchor_pos + 1, capture
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
-- Get a substring from the buffer, extending it if necessary
|
||||
local function sub(a, b)
|
||||
truncate()
|
||||
extend(b)
|
||||
return buffer:sub(anchor_pos + a - 1, anchor_pos + b - 1)
|
||||
end
|
||||
|
||||
|
||||
-- If the user hasn't specified a separator, try to work out what it is.
|
||||
local sep = parameters.separator
|
||||
if not sep then
|
||||
local _
|
||||
_, _, sep = find("([,\t])", 1)
|
||||
end
|
||||
sep = "(["..sep.."\n\r])"
|
||||
|
||||
|
||||
-- Start reading the file
|
||||
local field_count, fields, starts = 0, {}, {}
|
||||
local header
|
||||
|
||||
while true do
|
||||
local field_start_line = line
|
||||
local field_start_column = anchor_pos - line_start + 1
|
||||
local field_end, sep_end, this_sep
|
||||
|
||||
-- If the field is quoted, go find the other quote
|
||||
if sub(1, 1) == '"' then
|
||||
anchor_pos = anchor_pos + 1
|
||||
local current_pos = 0
|
||||
repeat
|
||||
local a, b, c = find('"("?)', current_pos + 1)
|
||||
current_pos = b
|
||||
until c ~= '"'
|
||||
if not current_pos then
|
||||
error(("%s:%d:%d: unmatched quote"):
|
||||
format(filename, field_start_line, field_start_column))
|
||||
end
|
||||
tidy = fix_quotes
|
||||
field_end, sep_end, this_sep = find("%s*(%S)", current_pos+1)
|
||||
if not this_sep:match(sep) then
|
||||
error(("%s:%d:%d: unmatched quote"):
|
||||
format(filename, field_start_line, field_start_column))
|
||||
end
|
||||
else
|
||||
field_end, sep_end, this_sep = find(sep, 1)
|
||||
tidy = trim_space
|
||||
end
|
||||
|
||||
-- Look for the separator or a newline.
|
||||
field_end = (field_end or 0) - 1
|
||||
|
||||
-- Read the field, then convert all the line endings to \n, and
|
||||
-- count any embedded line endings
|
||||
local value = sub(1, field_end)
|
||||
value = value:gsub("\r\n", "\n"):gsub("\r", "\n")
|
||||
for nl in value:gmatch("\n()") do
|
||||
line = line + 1
|
||||
line_start = nl + anchor_pos
|
||||
end
|
||||
|
||||
value = tidy(value)
|
||||
field_count = field_count + 1
|
||||
|
||||
-- Insert the value into the table for this "line"
|
||||
local key
|
||||
if column_index_map then
|
||||
value, key = transform_field(value, field_count, column_index_map,
|
||||
filename, field_start_line, field_start_column)
|
||||
elseif header then
|
||||
key = header[field_count]
|
||||
else
|
||||
key = field_count
|
||||
end
|
||||
if key then
|
||||
fields[key] = value
|
||||
starts[key] = { line=field_start_line, column=field_start_column }
|
||||
end
|
||||
|
||||
-- if we ended on a newline then yield the fields on this line.
|
||||
if not this_sep or this_sep == "\r" or this_sep == "\n" then
|
||||
if column_name_map and not column_index_map then
|
||||
column_index_map = build_column_index_map(fields, column_name_map)
|
||||
elseif parameters.header and not header then
|
||||
header = fields
|
||||
else
|
||||
coroutine.yield(fields, starts)
|
||||
end
|
||||
field_count, fields, starts = 0, {}, {}
|
||||
end
|
||||
|
||||
-- If we *really* didn't find a separator then we're done.
|
||||
if not sep_end then break end
|
||||
|
||||
-- If we ended on a newline then count it.
|
||||
if this_sep == "\r" or this_sep == "\n" then
|
||||
if this_sep == "\r" and sub(sep_end+1, sep_end+1) == "\n" then
|
||||
sep_end = sep_end + 1
|
||||
end
|
||||
line = line + 1
|
||||
line_start = anchor_pos + sep_end
|
||||
end
|
||||
|
||||
anchor_pos = anchor_pos + sep_end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
local file_mt =
|
||||
{
|
||||
lines = function(t)
|
||||
return coroutine.wrap(function()
|
||||
separated_values_iterator(t.file, t.parameters)
|
||||
end)
|
||||
end,
|
||||
close = function(t)
|
||||
t.file:close()
|
||||
end,
|
||||
}
|
||||
file_mt.__index = file_mt
|
||||
|
||||
|
||||
local function use(file, parameters)
|
||||
local f = { file = file, parameters = parameters }
|
||||
return setmetatable(f, file_mt)
|
||||
end
|
||||
|
||||
|
||||
--- Open a file for reading as a delimited file
|
||||
-- @return a file object
|
||||
local function open(
|
||||
filename, -- string: name of the file to open
|
||||
parameters) -- ?table: parameters controlling reading the file.
|
||||
-- See README.md
|
||||
local file, message = io.open(filename, "r")
|
||||
if not file then return nil, message end
|
||||
|
||||
parameters = parameters or {}
|
||||
parameters.filename = filename
|
||||
return use(file, parameters)
|
||||
end
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
return { open = open, use = use }
|
||||
|
||||
------------------------------------------------------------------------------
|
24
rockspecs/csv-1-1.rockspec
Normal file
24
rockspecs/csv-1-1.rockspec
Normal file
@ -0,0 +1,24 @@
|
||||
package = "csv"
|
||||
version = "1-1"
|
||||
source =
|
||||
{
|
||||
url = "git://github.com/geoffleyland/lua-csv.git",
|
||||
branch = "master",
|
||||
tag = "v1",
|
||||
}
|
||||
description =
|
||||
{
|
||||
summary = "CSV and other delimited file reading",
|
||||
homepage = "http://github.com/geoffleyland/lua-csv",
|
||||
license = "MIT/X11",
|
||||
maintainer = "Geoff Leyland <geoff.leyland@incremental.co.nz>"
|
||||
}
|
||||
dependencies = { "lua >= 5.1" }
|
||||
build =
|
||||
{
|
||||
type = "builtin",
|
||||
modules =
|
||||
{
|
||||
csv = "lua/csv.lua",
|
||||
},
|
||||
}
|
Loading…
Reference in New Issue
Block a user