mirror of
https://github.com/geoffleyland/lua-csv.git
synced 2024-11-23 01:34:19 +00:00
be more thorough about guessing delimiters
This commit is contained in:
parent
22d84c44ee
commit
6d254a27a4
52
lua/csv.lua
52
lua/csv.lua
@ -228,15 +228,41 @@ end
|
|||||||
|
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
--- If the user hasn't specified a separator, try to work out what it is.
|
local separator_candidates = { ",", "\t", "|" }
|
||||||
local function guess_separator(buffer, parameters)
|
local guess_separator_params = { record_limit = 8; }
|
||||||
local sep = parameters.separator
|
|
||||||
if not sep then
|
|
||||||
local _
|
local function try_separator(buffer, sep, f)
|
||||||
_, _, sep = buffer:find("([,\t])", 1)
|
guess_separator_params.separator = sep
|
||||||
|
local min, max = math.huge, 0
|
||||||
|
local lines, split_lines = 0, 0
|
||||||
|
local iterator = coroutine.wrap(function() f(buffer, guess_separator_params) end)
|
||||||
|
for t in iterator do
|
||||||
|
min = math.min(min, #t)
|
||||||
|
max = math.max(max, #t)
|
||||||
|
split_lines = split_lines + (t[2] and 1 or 0)
|
||||||
|
lines = lines + 1
|
||||||
end
|
end
|
||||||
sep = "(["..sep.."\n\r])"
|
if split_lines / lines > 0.75 then
|
||||||
return sep
|
return max - min
|
||||||
|
else
|
||||||
|
return math.huge
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
--- If the user hasn't specified a separator, try to work out what it is.
|
||||||
|
function guess_separator(buffer, f)
|
||||||
|
local best_separator, lowest_diff = "", math.huge
|
||||||
|
for _, s in ipairs(separator_candidates) do
|
||||||
|
local ok, diff = pcall(function() return try_separator(buffer, s, f) end)
|
||||||
|
if ok and diff < lowest_diff then
|
||||||
|
best_separator = s
|
||||||
|
lowest_diff = diff
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return best_separator
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
@ -309,13 +335,16 @@ local function separated_values_iterator(buffer, parameters)
|
|||||||
-- Is there some kind of Unicode BOM here?
|
-- Is there some kind of Unicode BOM here?
|
||||||
advance(find_unicode_BOM(field_sub))
|
advance(find_unicode_BOM(field_sub))
|
||||||
|
|
||||||
|
|
||||||
-- Start reading the file
|
-- Start reading the file
|
||||||
local sep = guess_separator(buffer, parameters)
|
local sep = "(["..(parameters.separator or
|
||||||
|
guess_separator(buffer, separated_values_iterator)).."\n\r])"
|
||||||
local line_start = 1
|
local line_start = 1
|
||||||
local line = 1
|
local line = 1
|
||||||
local field_count, fields, starts, nonblanks = 0, {}, {}
|
local field_count, fields, starts, nonblanks = 0, {}, {}
|
||||||
local header, header_read
|
local header, header_read
|
||||||
local field_start_line, field_start_column
|
local field_start_line, field_start_column
|
||||||
|
local record_count = 0
|
||||||
|
|
||||||
|
|
||||||
local function problem(message)
|
local function problem(message)
|
||||||
@ -393,6 +422,11 @@ local function separated_values_iterator(buffer, parameters)
|
|||||||
else
|
else
|
||||||
if nonblanks or field_count > 1 then -- ignore blank lines
|
if nonblanks or field_count > 1 then -- ignore blank lines
|
||||||
coroutine.yield(fields, starts)
|
coroutine.yield(fields, starts)
|
||||||
|
record_count = record_count + 1
|
||||||
|
if parameters.record_limit and
|
||||||
|
record_count >= parameters.record_limit then
|
||||||
|
break
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
field_count, fields, starts, nonblanks = 0, {}, {}
|
field_count, fields, starts, nonblanks = 0, {}, {}
|
||||||
|
@ -81,6 +81,14 @@ apple:four,charlie:60!]],
|
|||||||
apple = { name = "ALPHA", transform = string.lower },
|
apple = { name = "ALPHA", transform = string.lower },
|
||||||
charlie = { transform = function(x) return tonumber(x) * 10 end }}})
|
charlie = { transform = function(x) return tonumber(x) * 10 end }}})
|
||||||
|
|
||||||
|
test("../test-data/bars.txt", [[
|
||||||
|
there's a comma in this field, but no newline,embedded
|
||||||
|
newline,embedded
|
||||||
|
newline!
|
||||||
|
embedded
|
||||||
|
newline,embedded
|
||||||
|
newline,embedded
|
||||||
|
newline!]])
|
||||||
|
|
||||||
|
|
||||||
if errors == 0 then
|
if errors == 0 then
|
||||||
|
7
test-data/bars.txt
Normal file
7
test-data/bars.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
there's a comma in this field, but no newline|"embedded
|
||||||
|
newline"|"embedded
|
||||||
|
newline"
|
||||||
|
"embedded
|
||||||
|
newline"|"embedded
|
||||||
|
newline"|"embedded
|
||||||
|
newline"
|
Loading…
Reference in New Issue
Block a user