utf8 lib update

This commit is contained in:
João Lopes 2020-08-04 11:28:04 +01:00
parent 36e3d5874c
commit 72886439e7
19 changed files with 309 additions and 138 deletions

View File

@ -7,7 +7,7 @@ utf8.config.begins = utf8.config.begins or {
function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx) function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.begins) do for _, m in ipairs(utf8.config.begins) do
local functions, move = m.parse(regex, c, bs, ctx) local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("begins", _, c, bs, nbs, move, functions) utf8.debug("begins", _, c, bs, move, functions)
if functions then if functions then
return functions, move return functions, move
end end

View File

@ -4,26 +4,25 @@ local matchers = {
sliding = function() sliding = function()
return [[ return [[
add(function(ctx) -- sliding add(function(ctx) -- sliding
local saved = ctx:clone() while ctx.pos <= ctx.len do
local start_pos = ctx.pos local clone = ctx:clone()
while ctx.pos <= 1 + utf8len(ctx.str) do -- debug('starting from', clone, "start_pos", clone.pos)
debug('starting from', ctx, "start_pos", start_pos) clone.result.start = clone.pos
ctx.result.start = ctx.pos clone:next_function()
ctx:next_function() clone:get_function()(clone)
ctx:get_function()(ctx)
ctx = saved:clone() ctx:next_char()
start_pos = start_pos + 1 end
ctx.pos = start_pos ctx:terminate()
end
ctx:terminate()
end) end)
]] ]]
end, end,
fromstart = function(ctx) fromstart = function(ctx)
return [[ return [[
add(function(ctx) -- fromstart add(function(ctx) -- fromstart
local saved = ctx:clone() if ctx.byte_pos > ctx.len then
return
end
ctx.result.start = ctx.pos ctx.result.start = ctx.pos
ctx:next_function() ctx:next_function()
ctx:get_function()(ctx) ctx:get_function()(ctx)

View File

@ -89,34 +89,38 @@ function builder:include(b)
end end
function builder:build() function builder:build()
local codes_list = table.concat(self.codes or {}, ', ') if self.codes and #self.codes == 1 and not self.inverted and not self.ranges and not self.classes and not self.not_classes and not self.includes then
local ranges_list = '' return "{test = function(self, cc) return cc == " .. self.codes[1] .. " end}"
for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end else
local classes_list = '' local codes_list = table.concat(self.codes or {}, ', ')
if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end local ranges_list = ''
local not_classes_list = '' for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end
if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end local classes_list = ''
if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end
local not_classes_list = ''
if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end
local subs_list = '' local subs_list = ''
for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end
local src = [[cl.new():with_codes( local src = [[cl.new():with_codes(
]] .. codes_list .. [[ ]] .. codes_list .. [[
):with_ranges( ):with_ranges(
]] .. ranges_list .. [[ ]] .. ranges_list .. [[
):with_classes( ):with_classes(
]] .. classes_list .. [[ ]] .. classes_list .. [[
):without_classes( ):without_classes(
]] .. not_classes_list .. [[ ]] .. not_classes_list .. [[
):with_subs( ):with_subs(
]] .. subs_list .. [[ ]] .. subs_list .. [[
)]] )]]
if self.inverted then if self.inverted then
src = src .. ':invert()' src = src .. ':invert()'
end
return src
end end
return src
end end
return builder return builder

View File

@ -28,7 +28,7 @@ return function(str, c, bs, ctx)
if c == '%' then if c == '%' then
c, nbs = next(str, nbs) c, nbs = next(str, nbs)
r2 = c r2 = c
elseif c ~= '' then elseif c ~= '' and c ~= ']' then
r2 = c r2 = c
end end

View File

@ -16,6 +16,9 @@ local function parse(str, c, bs, ctx)
if c == '%' then if c == '%' then
c, nbs = next(str, bs) c, nbs = next(str, bs)
if c == '' then
error("malformed pattern (ends with '%')")
end
local _c = utf8.raw.lower(c) local _c = utf8.raw.lower(c)
local matched local matched
if _c == 'a' then if _c == 'a' then
@ -46,8 +49,15 @@ local function parse(str, c, bs, ctx)
else else
class = cl.new():with_classes(matched) class = cl.new():with_classes(matched)
end end
elseif _c == 'z' then
class = cl.new():with_codes(0)
if _c ~= c then
class = class:invert()
end
else
class = cl.new():with_codes(c)
end end
elseif c == '[' then elseif c == '[' and not ctx.internal then
local old_internal = ctx.internal local old_internal = ctx.internal
ctx.internal = true ctx.internal = true
class = cl.new() class = cl.new()
@ -58,9 +68,18 @@ local function parse(str, c, bs, ctx)
utf8.debug("next", tttt, c, nbs) utf8.debug("next", tttt, c, nbs)
if c == '^' and firstletter then if c == '^' and firstletter then
class:invert() class:invert()
local nc, nnbs = next(str, nbs)
if nc == ']' then
class:with_codes(nc)
nbs = nnbs
end
elseif c == ']' then elseif c == ']' then
utf8.debug('] on pos', tttt, nbs) if firstletter then
break class:with_codes(c)
else
utf8.debug('] on pos', tttt, nbs)
break
end
elseif c == '' then elseif c == '' then
error "malformed pattern (missing ']')" error "malformed pattern (missing ']')"
else else

View File

@ -71,7 +71,7 @@ function class:with_subs(...)
end end
function class:in_codes(item) function class:in_codes(item)
if not self.codes then return false end if not self.codes or #self.codes == 0 then return nil end
local head, tail = 1, #self.codes local head, tail = 1, #self.codes
local mid = math.floor((head + tail)/2) local mid = math.floor((head + tail)/2)
@ -93,7 +93,7 @@ function class:in_codes(item)
end end
function class:in_ranges(char_code) function class:in_ranges(char_code)
if not self.ranges then return false end if not self.ranges or #self.ranges == 0 then return nil end
for _,r in ipairs(self.ranges) do for _,r in ipairs(self.ranges) do
if r[1] <= char_code and char_code <= r[2] then if r[1] <= char_code and char_code <= r[2] then
@ -104,7 +104,7 @@ function class:in_ranges(char_code)
end end
function class:in_classes(char_code) function class:in_classes(char_code)
if not self.classes then return false end if not self.classes or #self.classes == 0 then return nil end
for _, class in ipairs(self.classes) do for _, class in ipairs(self.classes) do
if self:is(class, char_code) then if self:is(class, char_code) then
@ -115,7 +115,7 @@ function class:in_classes(char_code)
end end
function class:in_not_classes(char_code) function class:in_not_classes(char_code)
if not self.not_classes then return false end if not self.not_classes or #self.not_classes == 0 then return nil end
for _, class in ipairs(self.not_classes) do for _, class in ipairs(self.not_classes) do
if self:is(class, char_code) then if self:is(class, char_code) then
@ -130,7 +130,7 @@ function class:is(class, char_code)
end end
function class:in_subs(char_code) function class:in_subs(char_code)
if not self.subs or #self.subs == 0 then return false end if not self.subs or #self.subs == 0 then return nil end
for _, c in ipairs(self.subs) do for _, c in ipairs(self.subs) do
if not c:test(char_code) then if not c:test(char_code) then
@ -142,20 +142,40 @@ end
function class:test(char_code) function class:test(char_code)
local result = self:do_test(char_code) local result = self:do_test(char_code)
utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code) -- utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code)
return result return result
end end
function class:do_test(char_code) function class:do_test(char_code)
if not char_code then return false end if not char_code then return false end
local found = (self:in_codes(char_code) or self:in_ranges(char_code) or self:in_classes(char_code) or self:in_subs(char_code)) and not self:in_not_classes(char_code) local in_not_classes = self:in_not_classes(char_code)
utf8.debug('class:do_test', 'found', found, 'inverted', self.inverted, 'result', self.inverted and not found or found) if in_not_classes then
-- utf8.debug(self:in_codes(char_code), self:in_ranges(char_code), self:in_classes(char_code), self:in_subs(char_code), not self:in_not_classes(char_code)) return not not self.inverted
-- ternary if ideom (self.inverted and not found or found) doesn't work with booleans >_< end
if self.inverted then local in_codes = self:in_codes(char_code)
return not found if in_codes then
return not self.inverted
end
local in_ranges = self:in_ranges(char_code)
if in_ranges then
return not self.inverted
end
local in_classes = self:in_classes(char_code)
if in_classes then
return not self.inverted
end
local in_subs = self:in_subs(char_code)
if in_subs then
return not self.inverted
end
if (in_codes == nil)
and (in_ranges == nil)
and (in_classes == nil)
and (in_subs == nil)
and (in_not_classes == false) then
return not self.inverted
else else
return found return not not self.inverted
end end
end end

View File

@ -1,9 +1,14 @@
return function(utf8) return function(utf8)
local utf8unicode = utf8.byte local utf8unicode = utf8.unicode
local utf8sub = utf8.sub local utf8sub = utf8.sub
local sub = utf8.raw.sub
local byte = utf8.raw.byte
local utf8len = utf8.len local utf8len = utf8.len
local utf8next = utf8.next
local rawgsub = utf8.raw.gsub local rawgsub = utf8.raw.gsub
local utf8offset = utf8.offset
local utf8char = utf8.char
local util = utf8.util local util = utf8.util
@ -22,16 +27,42 @@ local mt = {
function ctx.new(obj) function ctx.new(obj)
obj = obj or {} obj = obj or {}
return setmetatable({ local res = setmetatable({
pos = obj.pos or 1, pos = obj.pos or 1,
str = obj.str or nil, byte_pos = obj.pos or 1,
str = assert(obj.str, "str is required"),
len = obj.len,
rawlen = obj.rawlen,
bytes = obj.bytes,
offsets = obj.offsets,
starts = obj.starts or nil, starts = obj.starts or nil,
functions = obj.functions or {}, functions = obj.functions or {},
func_pos = obj.func_pos or 1, func_pos = obj.func_pos or 1,
ends = obj.ends or nil, ends = obj.ends or nil,
result = obj.result and util.copy(obj.result) or {}, result = obj.result and util.copy(obj.result) or {},
captures = obj.captures and util.copy(obj.captures, true) or {active = {}}, captures = obj.captures and util.copy(obj.captures, true) or {active = {}},
modified = false,
}, mt) }, mt)
if not res.bytes then
local str = res.str
local l = #str
local bytes = utf8.config.int32array(l)
local offsets = utf8.config.int32array(l)
local c, bs, i = nil, 1, 1
while bs <= l do
bytes[i] = utf8unicode(str, bs, bs)
offsets[i] = bs
bs = utf8.next(str, bs)
i = i + 1
end
res.bytes = bytes
res.offsets = offsets
res.byte_pos = res.pos
res.len = i
res.rawlen = l
end
return res
end end
function ctx:clone() function ctx:clone()
@ -40,15 +71,22 @@ end
function ctx:next_char() function ctx:next_char()
self.pos = self.pos + 1 self.pos = self.pos + 1
self.byte_pos = self.pos
end
function ctx:prev_char()
self.pos = self.pos - 1
self.byte_pos = self.pos
end end
function ctx:get_char() function ctx:get_char()
return utf8sub(self.str, self.pos, self.pos) if self.len <= self.pos then return "" end
return utf8char(self.bytes[self.pos])
end end
function ctx:get_charcode() function ctx:get_charcode()
if utf8len(self.str) < self.pos then return nil end if self.len <= self.pos then return nil end
return utf8unicode(self:get_char()) return self.bytes[self.pos]
end end
function ctx:next_function() function ctx:next_function()

View File

@ -7,7 +7,7 @@ utf8.config.ends = utf8.config.ends or {
function utf8.regex.compiletime.ends.parse(regex, c, bs, ctx) function utf8.regex.compiletime.ends.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.ends) do for _, m in ipairs(utf8.config.ends) do
local functions, move = m.parse(regex, c, bs, ctx) local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("ends", _, c, bs, nbs, move, functions) utf8.debug("ends", _, c, bs, move, functions)
if functions then if functions then
return functions, move return functions, move
end end

View File

@ -13,7 +13,8 @@ local matchers = {
return [[ return [[
add(function(ctx) -- toend add(function(ctx) -- toend
ctx.result.finish = ctx.pos - 1 ctx.result.finish = ctx.pos - 1
if ctx.pos == #ctx.str + 1 then ctx:done() end ctx.modified = true
if ctx.pos == utf8len(ctx.str) + 1 then ctx:done() end
end) end)
]] ]]
end, end,

View File

@ -3,7 +3,22 @@ return function(utf8)
local utf8sub = utf8.sub local utf8sub = utf8.sub
local utf8gensub = utf8.gensub local utf8gensub = utf8.gensub
local unpack = utf8.config.unpack local unpack = utf8.config.unpack
local get_matcher_function = utf8:require 'regex_parser' local generate_matcher_function = utf8:require 'regex_parser'
function get_matcher_function(regex, plain)
local res
if utf8.config.cache then
res = utf8.config.cache[plain and "plain" or "regex"][regex]
end
if res then
return res
end
res = generate_matcher_function(regex, plain)
if utf8.config.cache then
utf8.config.cache[plain and "plain" or "regex"][regex] = res
end
return res
end
local function utf8find(str, regex, init, plain) local function utf8find(str, regex, init, plain)
local func = get_matcher_function(regex, plain) local func = get_matcher_function(regex, plain)
@ -19,7 +34,8 @@ local function utf8find(str, regex, init, plain)
end end
local function utf8match(str, regex, init) local function utf8match(str, regex, init)
local func = get_matcher_function(regex, plain, utf8) local func = get_matcher_function(regex, false)
init = ((init or 1) < 0) and (utf8.len(str) + init + 1) or init
local ctx, result, captures = func(str, init, utf8) local ctx, result, captures = func(str, init, utf8)
if not ctx then return nil end if not ctx then return nil end
@ -34,7 +50,7 @@ end
local function utf8gmatch(str, regex) local function utf8gmatch(str, regex)
regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
local func = get_matcher_function(regex, plain, utf8) local func = get_matcher_function(regex, false)
local ctx, result, captures local ctx, result, captures
local continue_pos = 1 local continue_pos = 1
@ -71,7 +87,7 @@ local function replace(repl, args)
else else
num = tonumber(c) num = tonumber(c)
if num then if num then
ret = ret .. args[num] ret = ret .. assert(args[num], "invalid capture index %" .. c)
else else
ret = ret .. c ret = ret .. c
end end
@ -79,13 +95,9 @@ local function replace(repl, args)
end end
end end
elseif type(repl) == 'table' then elseif type(repl) == 'table' then
ret = repl[args[1] or args[0]] or '' ret = repl[args[1]] or args[0]
elseif type(repl) == 'function' then elseif type(repl) == 'function' then
if #args > 0 then ret = repl(unpack(args, 1)) or args[0]
ret = repl(unpack(args, 1)) or ''
else
ret = repl(args[0]) or ''
end
end end
return ret return ret
end end
@ -95,8 +107,7 @@ local function utf8gsub(str, regex, repl, limit)
local subbed = '' local subbed = ''
local prev_sub_finish = 1 local prev_sub_finish = 1
regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex local func = get_matcher_function(regex, false)
local func = get_matcher_function(regex, plain, utf8)
local ctx, result, captures local ctx, result, captures
local continue_pos = 1 local continue_pos = 1
@ -111,7 +122,13 @@ local function utf8gsub(str, regex, repl, limit)
utf8.debug('captures:', captures) utf8.debug('captures:', captures)
continue_pos = math.max(result.finish + 1, result.start + 1) continue_pos = math.max(result.finish + 1, result.start + 1)
local args = {[0] = utf8sub(str, result.start, result.finish), unpack(captures)} local args
if #captures > 0 then
args = {[0] = utf8sub(str, result.start, result.finish), unpack(captures)}
else
args = {[0] = utf8sub(str, result.start, result.finish)}
args[1] = args[0]
end
subbed = subbed .. utf8sub(str, prev_sub_finish, result.start - 1) subbed = subbed .. utf8sub(str, prev_sub_finish, result.start - 1)
subbed = subbed .. replace(repl, args) subbed = subbed .. replace(repl, args)

View File

@ -1,5 +1,8 @@
local module_path = ... local module_path = ...
module_path = module_path:match("^(.-)init$") or (module_path .. '.') module_path = module_path:match("^(.-)init$") or (module_path .. '.')
local ffi_enabled, ffi = pcall(require, 'ffi')
local utf8 = { local utf8 = {
config = {}, config = {},
default = { default = {
@ -15,7 +18,14 @@ local utf8 = {
__mode = 'kv' __mode = 'kv'
}), }),
}, },
locale = "C.UTF-8", locale = nil,
int32array = function(size)
if ffi_enabled then
return ffi.new("uint32_t[?]", size + 1)
else
return {}
end
end
}, },
regex = { regex = {
compiletime = { compiletime = {

View File

@ -7,12 +7,13 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[ local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- frontier add(function(ctx) -- frontier
ctx.pos = ctx.pos - 1 ctx:prev_char()
local prev_charcode = ctx:get_charcode() local prev_charcode = ctx:get_charcode() or 0
ctx:next_char() ctx:next_char()
debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", ctx:get_charcode()) local charcode = ctx:get_charcode() or 0
-- debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", charcode)
if ]] .. class_name .. [[:test(prev_charcode) then return end if ]] .. class_name .. [[:test(prev_charcode) then return end
if ]] .. class_name .. [[:test(ctx:get_charcode()) then if ]] .. class_name .. [[:test(charcode) then
ctx:next_function() ctx:next_function()
return ctx:get_function()(ctx) return ctx:get_function()(ctx)
end end

View File

@ -9,7 +9,7 @@ utf8.config.modifier = utf8.config.modifier or {
function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx) function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.modifier) do for _, m in ipairs(utf8.config.modifier) do
local functions, move = m.parse(regex, c, bs, ctx) local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("mod", _, c, bs, nbs, move, functions and utf8.config.unpack(functions)) utf8.debug("mod", _, c, bs, move, functions and utf8.config.unpack(functions))
if functions then if functions then
ctx.prev_class = nil ctx.prev_class = nil
return functions, move return functions, move

View File

@ -7,7 +7,7 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[ local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- simple add(function(ctx) -- simple
debug(ctx, 'simple', ']] .. class_name .. [[') -- debug(ctx, 'simple', ']] .. class_name .. [[')
if ]] .. class_name .. [[:test(ctx:get_charcode()) then if ]] .. class_name .. [[:test(ctx:get_charcode()) then
ctx:next_char() ctx:next_char()
ctx:next_function() ctx:next_function()

View File

@ -15,7 +15,7 @@ end
local function check(ctx) local function check(ctx)
if ctx.prev_class then if ctx.prev_class then
table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(bs))) table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(ctx.pos)))
ctx.prev_class = nil ctx.prev_class = nil
end end
end end

View File

@ -10,18 +10,21 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[ local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- star add(function(ctx) -- star
debug(ctx, 'star', ']] .. class_name .. [[') -- debug(ctx, 'star', ']] .. class_name .. [[')
local saved = {ctx:clone()} local clone = ctx:clone()
while ]] .. class_name .. [[:test(ctx:get_charcode()) do while ]] .. class_name .. [[:test(clone:get_charcode()) do
ctx:next_char() clone:next_char()
table.insert(saved, ctx:clone())
debug('#saved <<', #saved)
end end
while #saved > 0 do local pos = clone.pos
ctx = table.remove(saved) while pos >= ctx.pos do
ctx:next_function() clone.pos = pos
ctx:get_function()(ctx) clone.func_pos = ctx.func_pos
debug('#saved >>', #saved) clone:next_function()
clone:get_function()(clone)
if clone.modified then
clone = ctx:clone()
end
pos = pos - 1
end end
end) end)
]] ]]
@ -32,15 +35,23 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[ local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- minus add(function(ctx) -- minus
debug(ctx, 'minus', ']] .. class_name .. [[') -- debug(ctx, 'minus', ']] .. class_name .. [[')
local clone = ctx:clone()
local pos
repeat repeat
local saved = ctx:clone() pos = clone.pos
ctx:next_function() clone:next_function()
ctx:get_function()(ctx) clone:get_function()(clone)
ctx = saved if clone.modified then
local match = ]] .. class_name .. [[:test(ctx:get_charcode()) clone = ctx:clone()
ctx:next_char() clone.pos = pos
else
clone.pos = pos
clone.func_pos = ctx.func_pos
end
local match = ]] .. class_name .. [[:test(clone:get_charcode())
clone:next_char()
until not match until not match
end) end)
]] ]]
@ -51,7 +62,7 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[ local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- question add(function(ctx) -- question
debug(ctx, 'question', ']] .. class_name .. [[') -- debug(ctx, 'question', ']] .. class_name .. [[')
local saved = ctx:clone() local saved = ctx:clone()
if ]] .. class_name .. [[:test(ctx:get_charcode()) then if ]] .. class_name .. [[:test(ctx:get_charcode()) then
ctx:next_char() ctx:next_char()
@ -67,8 +78,9 @@ local matchers = {
capture_start = function(number) capture_start = function(number)
return [[ return [[
add(function(ctx) add(function(ctx)
debug(ctx, 'capture_start', ']] .. tostring(number) .. [[') ctx.modified = true
table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start_byte = byte_pos, start = ctx.pos }) -- debug(ctx, 'capture_start', ']] .. tostring(number) .. [[')
table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start = ctx.pos })
ctx:next_function() ctx:next_function()
return ctx:get_function()(ctx) return ctx:get_function()(ctx)
end) end)
@ -77,12 +89,34 @@ local matchers = {
capture_finish = function(number) capture_finish = function(number)
return [[ return [[
add(function(ctx) add(function(ctx)
debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[') ctx.modified = true
-- debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[')
local cap = table.remove(ctx.captures.active) local cap = table.remove(ctx.captures.active)
cap.finish_byte = byte_pos
cap.finish = ctx.pos cap.finish = ctx.pos
ctx.captures[cap.id] = utf8sub(ctx.str, cap.start, cap.finish - 1) local b, e = ctx.offsets[cap.start], ctx.offsets[cap.finish]
debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id]) if cap.start < 1 then
b = 1
elseif cap.start >= ctx.len then
b = ctx.rawlen + 1
end
if cap.finish < 1 then
e = 1
elseif cap.finish >= ctx.len then
e = ctx.rawlen + 1
end
ctx.captures[cap.id] = rawsub(ctx.str, b, e - 1)
-- debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id])
ctx:next_function()
return ctx:get_function()(ctx)
end)
]]
end,
capture_position = function(number)
return [[
add(function(ctx)
ctx.modified = true
-- debug(ctx, 'capture_position', ']] .. tostring(number) .. [[')
ctx.captures[ ]] .. tostring(number) .. [[ ] = ctx.pos
ctx:next_function() ctx:next_function()
return ctx:get_function()(ctx) return ctx:get_function()(ctx)
end) end)
@ -91,11 +125,11 @@ local matchers = {
capture = function(number) capture = function(number)
return [[ return [[
add(function(ctx) add(function(ctx)
debug(ctx, 'capture', ']] .. tostring(number) .. [[') -- debug(ctx, 'capture', ']] .. tostring(number) .. [[')
local cap = ctx.captures[ ]] .. tostring(number) .. [[ ] local cap = ctx.captures[ ]] .. tostring(number) .. [[ ]
local len = utf8len(cap) local len = utf8len(cap)
local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1) local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1)
debug("capture check:", cap, check) -- debug("capture check:", cap, check)
if cap == check then if cap == check then
ctx.pos = ctx.pos + len ctx.pos = ctx.pos + len
ctx:next_function() ctx:next_function()
@ -121,9 +155,9 @@ local matchers = {
elseif c == b then elseif c == b then
balance = balance - 1 balance = balance - 1
end end
debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode()) -- debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode())
ctx:next_char() ctx:next_char()
until balance == 0 until balance == 0 or (balance == 2 and d == b)
ctx:next_function() ctx:next_function()
return ctx:get_function()(ctx) return ctx:get_function()(ctx)
end) end)
@ -139,6 +173,9 @@ local function parse(regex, c, bs, ctx)
if c == '%' then if c == '%' then
c, nbs = next(regex, bs) c, nbs = next(regex, bs)
utf8.debug("next", c, bs) utf8.debug("next", c, bs)
if c == '' then
error("malformed pattern (ends with '%')")
end
if utf8.raw.find('123456789', c, 1, true) then if utf8.raw.find('123456789', c, 1, true) then
functions = { matchers.capture(tonumber(c)) } functions = { matchers.capture(tonumber(c)) }
nbs = utf8.next(regex, nbs) nbs = utf8.next(regex, nbs)
@ -146,6 +183,7 @@ local function parse(regex, c, bs, ctx)
local d, b local d, b
d, nbs = next(regex, nbs) d, nbs = next(regex, nbs)
b, nbs = next(regex, nbs) b, nbs = next(regex, nbs)
assert(d ~= '' and b ~= '', "unbalanced pattern")
functions = { matchers.balancer({d, b}, tostring(bs)) } functions = { matchers.balancer({d, b}, tostring(bs)) }
nbs = utf8.next(regex, nbs) nbs = utf8.next(regex, nbs)
end end
@ -191,13 +229,19 @@ local function parse(regex, c, bs, ctx)
nbs = bs + 1 nbs = bs + 1
elseif c == '(' then elseif c == '(' then
ctx.capture = ctx.capture or {balance = 0, id = 0} ctx.capture = ctx.capture or {balance = 0, id = 0}
ctx.capture.balance = ctx.capture.balance + 1
ctx.capture.id = ctx.capture.id + 1 ctx.capture.id = ctx.capture.id + 1
functions = { matchers.capture_start(ctx.capture.id) } local nc = next(regex, nbs)
if nc == ')' then
functions = {matchers.capture_position(ctx.capture.id)}
nbs = bs + 2
else
ctx.capture.balance = ctx.capture.balance + 1
functions = {matchers.capture_start(ctx.capture.id)}
nbs = bs + 1
end
if ctx.prev_class then if ctx.prev_class then
table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs)))
end end
nbs = bs + 1
elseif c == ')' then elseif c == ')' then
ctx.capture = ctx.capture or {balance = 0, id = 0} ctx.capture = ctx.capture or {balance = 0, id = 0}
functions = { matchers.capture_finish(ctx.capture.id) } functions = { matchers.capture_finish(ctx.capture.id) }

View File

@ -80,8 +80,14 @@ local function utf8symbollen(byte)
return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1 return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1
end end
local head_table = utf8.config.int32array(256)
for i = 0, 255 do
head_table[i] = utf8symbollen(i)
end
head_table[256] = 0
local function utf8charbytes(str, bs) local function utf8charbytes(str, bs)
return utf8symbollen(byte(str, bs)) return head_table[byte(str, bs) or 256]
end end
local function utf8next(str, bs) local function utf8next(str, bs)
@ -201,13 +207,12 @@ utf8unicode = function(str, ibs, jbs)
bytes = utf8charbytes(str, ibs) bytes = utf8charbytes(str, ibs)
if bytes == 0 then return end if bytes == 0 then return end
ch = sub(str,ibs,ibs-1+bytes)
local unicode local unicode
if bytes == 1 then unicode = byte(ch) end if bytes == 1 then unicode = byte(str, ibs, ibs) end
if bytes == 2 then if bytes == 2 then
local byte0,byte1 = byte(ch,1,2) local byte0,byte1 = byte(str, ibs, ibs + 1)
if byte0 and byte1 then if byte0 and byte1 then
local code0,code1 = byte0-0xC0,byte1-0x80 local code0,code1 = byte0-0xC0,byte1-0x80
unicode = code0*shift_6 + code1 unicode = code0*shift_6 + code1
@ -216,7 +221,7 @@ utf8unicode = function(str, ibs, jbs)
end end
end end
if bytes == 3 then if bytes == 3 then
local byte0,byte1,byte2 = byte(ch,1,3) local byte0,byte1,byte2 = byte(str, ibs, ibs + 2)
if byte0 and byte1 and byte2 then if byte0 and byte1 and byte2 then
local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80 local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
unicode = code0*shift_12 + code1*shift_6 + code2 unicode = code0*shift_12 + code1*shift_6 + code2
@ -225,7 +230,7 @@ utf8unicode = function(str, ibs, jbs)
end end
end end
if bytes == 4 then if bytes == 4 then
local byte0,byte1,byte2,byte3 = byte(ch,1,4) local byte0,byte1,byte2,byte3 = byte(str, ibs, ibs + 3)
if byte0 and byte1 and byte2 and byte3 then if byte0 and byte1 and byte2 and byte3 then
local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80 local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3 unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
@ -234,7 +239,11 @@ utf8unicode = function(str, ibs, jbs)
end end
end end
return unicode,utf8unicode(str, ibs+bytes, jbs) if ibs == jbs then
return unicode
else
return unicode,utf8unicode(str, ibs+bytes, jbs)
end
end end
local function utf8byte(str, i, j) local function utf8byte(str, i, j)
@ -281,7 +290,7 @@ local function utf8gensub(str, sub_len)
return function(skip_ptr, bs) return function(skip_ptr, bs)
bs = (bs and bs or 1) + (skip_ptr and (skip_ptr[1] or 0) or 0) bs = (bs and bs or 1) + (skip_ptr and (skip_ptr[1] or 0) or 0)
nbs = bs local nbs = bs
if bs > max_len then return nil end if bs > max_len then return nil end
for i = 1, sub_len do for i = 1, sub_len do
nbs = utf8next(str, nbs) nbs = utf8next(str, nbs)
@ -427,7 +436,7 @@ local function utf8offset(str, n, bs)
bs = 1 bs = 1
end end
end end
if bs < 0 or bs > l + 1 then if bs <= 0 or bs > l + 1 then
error("bad argument #3 to 'offset' (position out of range)") error("bad argument #3 to 'offset' (position out of range)")
end end
@ -437,8 +446,8 @@ local function utf8offset(str, n, bs)
end end
while true do while true do
local b = byte(str, bs) local b = byte(str, bs)
if 0 < b and b < 127 if (0 < b and b < 127)
or 194 < b and b < 244 then or (194 < b and b < 244) then
return bs return bs
end end
bs = bs - 1 bs = bs - 1
@ -454,8 +463,8 @@ local function utf8offset(str, n, bs)
end end
local b = byte(str, bs) local b = byte(str, bs)
if 0 < b and b < 127 if (0 < b and b < 127)
or 194 < b and b < 244 then or (194 < b and b < 244) then
n = n + 1 n = n + 1
end end
bs = bs - 1 bs = bs - 1
@ -468,8 +477,8 @@ local function utf8offset(str, n, bs)
end end
local b = byte(str, bs) local b = byte(str, bs)
if 0 < b and b < 127 if (0 < b and b < 127)
or 194 < b and b < 244 then or (194 < b and b < 244) then
n = n - 1 n = n - 1
for i = 1, n do for i = 1, n do
if bs > l then if bs > l then

View File

@ -1,12 +1,19 @@
return function(utf8) return function(utf8)
os.setlocale(utf8.config.locale, "ctype") local ffi = require("ffi")
if ffi.os == "Windows" then
local ffi = require("ffi") os.setlocale(utf8.config.locale or "english_us.65001", "ctype")
ffi.cdef[[ ffi.cdef[[
int towupper(int c); short towupper(short c);
int towlower(int c); short towlower(short c);
]] ]]
else
os.setlocale(utf8.config.locale or "C.UTF-8", "ctype")
ffi.cdef[[
int towupper(int c);
int towlower(int c);
]]
end
utf8:require "primitives.dummy" utf8:require "primitives.dummy"

View File

@ -58,7 +58,9 @@ return function(regex, plain)
local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1}) local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1})
local cl = utf8:require("charclass.runtime.init") local cl = utf8:require("charclass.runtime.init")
local utf8sub = utf8.sub local utf8sub = utf8.sub
local rawsub = utf8.raw.sub
local utf8len = utf8.len local utf8len = utf8.len
local utf8next = utf8.next
local debug = utf8.debug local debug = utf8.debug
local function add(fun) local function add(fun)
ctx.functions[#ctx.functions + 1] = fun ctx.functions[#ctx.functions + 1] = fun