utf8 lib update

This commit is contained in:
João Lopes 2020-08-04 11:28:04 +01:00
parent 36e3d5874c
commit 72886439e7
19 changed files with 309 additions and 138 deletions

View File

@ -7,7 +7,7 @@ utf8.config.begins = utf8.config.begins or {
function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.begins) do
local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("begins", _, c, bs, nbs, move, functions)
utf8.debug("begins", _, c, bs, move, functions)
if functions then
return functions, move
end

View File

@ -4,26 +4,25 @@ local matchers = {
sliding = function()
return [[
add(function(ctx) -- sliding
local saved = ctx:clone()
local start_pos = ctx.pos
while ctx.pos <= 1 + utf8len(ctx.str) do
debug('starting from', ctx, "start_pos", start_pos)
ctx.result.start = ctx.pos
ctx:next_function()
ctx:get_function()(ctx)
while ctx.pos <= ctx.len do
local clone = ctx:clone()
-- debug('starting from', clone, "start_pos", clone.pos)
clone.result.start = clone.pos
clone:next_function()
clone:get_function()(clone)
ctx = saved:clone()
start_pos = start_pos + 1
ctx.pos = start_pos
end
ctx:terminate()
ctx:next_char()
end
ctx:terminate()
end)
]]
end,
fromstart = function(ctx)
return [[
add(function(ctx) -- fromstart
local saved = ctx:clone()
if ctx.byte_pos > ctx.len then
return
end
ctx.result.start = ctx.pos
ctx:next_function()
ctx:get_function()(ctx)

View File

@ -89,34 +89,38 @@ function builder:include(b)
end
function builder:build()
local codes_list = table.concat(self.codes or {}, ', ')
local ranges_list = ''
for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end
local classes_list = ''
if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end
local not_classes_list = ''
if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end
if self.codes and #self.codes == 1 and not self.inverted and not self.ranges and not self.classes and not self.not_classes and not self.includes then
return "{test = function(self, cc) return cc == " .. self.codes[1] .. " end}"
else
local codes_list = table.concat(self.codes or {}, ', ')
local ranges_list = ''
for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end
local classes_list = ''
if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end
local not_classes_list = ''
if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end
local subs_list = ''
for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end
local subs_list = ''
for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end
local src = [[cl.new():with_codes(
]] .. codes_list .. [[
):with_ranges(
]] .. ranges_list .. [[
):with_classes(
]] .. classes_list .. [[
):without_classes(
]] .. not_classes_list .. [[
):with_subs(
]] .. subs_list .. [[
)]]
local src = [[cl.new():with_codes(
]] .. codes_list .. [[
):with_ranges(
]] .. ranges_list .. [[
):with_classes(
]] .. classes_list .. [[
):without_classes(
]] .. not_classes_list .. [[
):with_subs(
]] .. subs_list .. [[
)]]
if self.inverted then
src = src .. ':invert()'
if self.inverted then
src = src .. ':invert()'
end
return src
end
return src
end
return builder

View File

@ -28,7 +28,7 @@ return function(str, c, bs, ctx)
if c == '%' then
c, nbs = next(str, nbs)
r2 = c
elseif c ~= '' then
elseif c ~= '' and c ~= ']' then
r2 = c
end

View File

@ -16,6 +16,9 @@ local function parse(str, c, bs, ctx)
if c == '%' then
c, nbs = next(str, bs)
if c == '' then
error("malformed pattern (ends with '%')")
end
local _c = utf8.raw.lower(c)
local matched
if _c == 'a' then
@ -46,8 +49,15 @@ local function parse(str, c, bs, ctx)
else
class = cl.new():with_classes(matched)
end
elseif _c == 'z' then
class = cl.new():with_codes(0)
if _c ~= c then
class = class:invert()
end
else
class = cl.new():with_codes(c)
end
elseif c == '[' then
elseif c == '[' and not ctx.internal then
local old_internal = ctx.internal
ctx.internal = true
class = cl.new()
@ -58,9 +68,18 @@ local function parse(str, c, bs, ctx)
utf8.debug("next", tttt, c, nbs)
if c == '^' and firstletter then
class:invert()
local nc, nnbs = next(str, nbs)
if nc == ']' then
class:with_codes(nc)
nbs = nnbs
end
elseif c == ']' then
utf8.debug('] on pos', tttt, nbs)
break
if firstletter then
class:with_codes(c)
else
utf8.debug('] on pos', tttt, nbs)
break
end
elseif c == '' then
error "malformed pattern (missing ']')"
else

View File

@ -71,7 +71,7 @@ function class:with_subs(...)
end
function class:in_codes(item)
if not self.codes then return false end
if not self.codes or #self.codes == 0 then return nil end
local head, tail = 1, #self.codes
local mid = math.floor((head + tail)/2)
@ -93,7 +93,7 @@ function class:in_codes(item)
end
function class:in_ranges(char_code)
if not self.ranges then return false end
if not self.ranges or #self.ranges == 0 then return nil end
for _,r in ipairs(self.ranges) do
if r[1] <= char_code and char_code <= r[2] then
@ -104,7 +104,7 @@ function class:in_ranges(char_code)
end
function class:in_classes(char_code)
if not self.classes then return false end
if not self.classes or #self.classes == 0 then return nil end
for _, class in ipairs(self.classes) do
if self:is(class, char_code) then
@ -115,7 +115,7 @@ function class:in_classes(char_code)
end
function class:in_not_classes(char_code)
if not self.not_classes then return false end
if not self.not_classes or #self.not_classes == 0 then return nil end
for _, class in ipairs(self.not_classes) do
if self:is(class, char_code) then
@ -130,7 +130,7 @@ function class:is(class, char_code)
end
function class:in_subs(char_code)
if not self.subs or #self.subs == 0 then return false end
if not self.subs or #self.subs == 0 then return nil end
for _, c in ipairs(self.subs) do
if not c:test(char_code) then
@ -142,20 +142,40 @@ end
function class:test(char_code)
local result = self:do_test(char_code)
utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code)
-- utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code)
return result
end
function class:do_test(char_code)
if not char_code then return false end
local found = (self:in_codes(char_code) or self:in_ranges(char_code) or self:in_classes(char_code) or self:in_subs(char_code)) and not self:in_not_classes(char_code)
utf8.debug('class:do_test', 'found', found, 'inverted', self.inverted, 'result', self.inverted and not found or found)
-- utf8.debug(self:in_codes(char_code), self:in_ranges(char_code), self:in_classes(char_code), self:in_subs(char_code), not self:in_not_classes(char_code))
-- ternary if idiom (self.inverted and not found or found) doesn't work with booleans >_<
if self.inverted then
return not found
local in_not_classes = self:in_not_classes(char_code)
if in_not_classes then
return not not self.inverted
end
local in_codes = self:in_codes(char_code)
if in_codes then
return not self.inverted
end
local in_ranges = self:in_ranges(char_code)
if in_ranges then
return not self.inverted
end
local in_classes = self:in_classes(char_code)
if in_classes then
return not self.inverted
end
local in_subs = self:in_subs(char_code)
if in_subs then
return not self.inverted
end
if (in_codes == nil)
and (in_ranges == nil)
and (in_classes == nil)
and (in_subs == nil)
and (in_not_classes == false) then
return not self.inverted
else
return found
return not not self.inverted
end
end

View File

@ -1,9 +1,14 @@
return function(utf8)
local utf8unicode = utf8.byte
local utf8unicode = utf8.unicode
local utf8sub = utf8.sub
local sub = utf8.raw.sub
local byte = utf8.raw.byte
local utf8len = utf8.len
local utf8next = utf8.next
local rawgsub = utf8.raw.gsub
local utf8offset = utf8.offset
local utf8char = utf8.char
local util = utf8.util
@ -22,16 +27,42 @@ local mt = {
function ctx.new(obj)
obj = obj or {}
return setmetatable({
local res = setmetatable({
pos = obj.pos or 1,
str = obj.str or nil,
byte_pos = obj.pos or 1,
str = assert(obj.str, "str is required"),
len = obj.len,
rawlen = obj.rawlen,
bytes = obj.bytes,
offsets = obj.offsets,
starts = obj.starts or nil,
functions = obj.functions or {},
func_pos = obj.func_pos or 1,
ends = obj.ends or nil,
result = obj.result and util.copy(obj.result) or {},
captures = obj.captures and util.copy(obj.captures, true) or {active = {}},
modified = false,
}, mt)
if not res.bytes then
local str = res.str
local l = #str
local bytes = utf8.config.int32array(l)
local offsets = utf8.config.int32array(l)
local c, bs, i = nil, 1, 1
while bs <= l do
bytes[i] = utf8unicode(str, bs, bs)
offsets[i] = bs
bs = utf8.next(str, bs)
i = i + 1
end
res.bytes = bytes
res.offsets = offsets
res.byte_pos = res.pos
res.len = i
res.rawlen = l
end
return res
end
function ctx:clone()
@ -40,15 +71,22 @@ end
-- Moves the context one position forward.
-- Both `pos` and `byte_pos` end up holding the same, incremented value.
function ctx:next_char()
  local advanced = self.pos + 1
  self.pos = advanced
  self.byte_pos = advanced
end
-- Moves the context one position backward.
-- Both `pos` and `byte_pos` end up holding the same, decremented value.
function ctx:prev_char()
  local rewound = self.pos - 1
  self.pos = rewound
  self.byte_pos = rewound
end
function ctx:get_char()
return utf8sub(self.str, self.pos, self.pos)
if self.len <= self.pos then return "" end
return utf8char(self.bytes[self.pos])
end
function ctx:get_charcode()
if utf8len(self.str) < self.pos then return nil end
return utf8unicode(self:get_char())
if self.len <= self.pos then return nil end
return self.bytes[self.pos]
end
function ctx:next_function()

View File

@ -7,7 +7,7 @@ utf8.config.ends = utf8.config.ends or {
function utf8.regex.compiletime.ends.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.ends) do
local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("ends", _, c, bs, nbs, move, functions)
utf8.debug("ends", _, c, bs, move, functions)
if functions then
return functions, move
end

View File

@ -13,7 +13,8 @@ local matchers = {
return [[
add(function(ctx) -- toend
ctx.result.finish = ctx.pos - 1
if ctx.pos == #ctx.str + 1 then ctx:done() end
ctx.modified = true
if ctx.pos == utf8len(ctx.str) + 1 then ctx:done() end
end)
]]
end,

View File

@ -3,7 +3,22 @@ return function(utf8)
local utf8sub = utf8.sub
local utf8gensub = utf8.gensub
local unpack = utf8.config.unpack
local get_matcher_function = utf8:require 'regex_parser'
local generate_matcher_function = utf8:require 'regex_parser'
-- Returns the matcher for `regex`, generating it on a cache miss.
--
-- When `utf8.config.cache` is set, matchers are looked up and stored in
-- one of two buckets, "plain" or "regex", chosen by the `plain` flag and
-- keyed by the pattern itself. Without a cache every call generates a
-- fresh matcher.
--
-- Declared `local` so the helper does not leak into the global
-- environment; it is only called by the functions defined below it.
--
-- @param regex  pattern used as the cache key and passed to the generator
-- @param plain  truthy selects the "plain" bucket; also passed to the generator
-- @return whatever `generate_matcher_function(regex, plain)` produced
local function get_matcher_function(regex, plain)
  local cache = utf8.config.cache
  -- NOTE(review): assumes both buckets exist whenever a cache is configured
  local bucket = cache and cache[plain and "plain" or "regex"]

  local matcher = bucket and bucket[regex]
  if matcher then
    return matcher
  end

  matcher = generate_matcher_function(regex, plain)
  if cache then
    bucket[regex] = matcher
  end
  return matcher
end
local function utf8find(str, regex, init, plain)
local func = get_matcher_function(regex, plain)
@ -19,7 +34,8 @@ local function utf8find(str, regex, init, plain)
end
local function utf8match(str, regex, init)
local func = get_matcher_function(regex, plain, utf8)
local func = get_matcher_function(regex, false)
init = ((init or 1) < 0) and (utf8.len(str) + init + 1) or init
local ctx, result, captures = func(str, init, utf8)
if not ctx then return nil end
@ -34,7 +50,7 @@ end
local function utf8gmatch(str, regex)
regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
local func = get_matcher_function(regex, plain, utf8)
local func = get_matcher_function(regex, false)
local ctx, result, captures
local continue_pos = 1
@ -71,7 +87,7 @@ local function replace(repl, args)
else
num = tonumber(c)
if num then
ret = ret .. args[num]
ret = ret .. assert(args[num], "invalid capture index %" .. c)
else
ret = ret .. c
end
@ -79,13 +95,9 @@ local function replace(repl, args)
end
end
elseif type(repl) == 'table' then
ret = repl[args[1] or args[0]] or ''
ret = repl[args[1]] or args[0]
elseif type(repl) == 'function' then
if #args > 0 then
ret = repl(unpack(args, 1)) or ''
else
ret = repl(args[0]) or ''
end
ret = repl(unpack(args, 1)) or args[0]
end
return ret
end
@ -95,8 +107,7 @@ local function utf8gsub(str, regex, repl, limit)
local subbed = ''
local prev_sub_finish = 1
regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
local func = get_matcher_function(regex, plain, utf8)
local func = get_matcher_function(regex, false)
local ctx, result, captures
local continue_pos = 1
@ -111,7 +122,13 @@ local function utf8gsub(str, regex, repl, limit)
utf8.debug('captures:', captures)
continue_pos = math.max(result.finish + 1, result.start + 1)
local args = {[0] = utf8sub(str, result.start, result.finish), unpack(captures)}
local args
if #captures > 0 then
args = {[0] = utf8sub(str, result.start, result.finish), unpack(captures)}
else
args = {[0] = utf8sub(str, result.start, result.finish)}
args[1] = args[0]
end
subbed = subbed .. utf8sub(str, prev_sub_finish, result.start - 1)
subbed = subbed .. replace(repl, args)

View File

@ -1,5 +1,8 @@
local module_path = ...
module_path = module_path:match("^(.-)init$") or (module_path .. '.')
local ffi_enabled, ffi = pcall(require, 'ffi')
local utf8 = {
config = {},
default = {
@ -15,7 +18,14 @@ local utf8 = {
__mode = 'kv'
}),
},
locale = "C.UTF-8",
locale = nil,
-- Creates the storage used for integer arrays.
-- With FFI available this is a `uint32_t` C array holding `size + 1`
-- elements; otherwise an empty Lua table is returned.
int32array = function(size)
  if not ffi_enabled then
    return {}
  end
  return ffi.new("uint32_t[?]", size + 1)
end
},
regex = {
compiletime = {

View File

@ -7,12 +7,13 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- frontier
ctx.pos = ctx.pos - 1
local prev_charcode = ctx:get_charcode()
ctx:prev_char()
local prev_charcode = ctx:get_charcode() or 0
ctx:next_char()
debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", ctx:get_charcode())
local charcode = ctx:get_charcode() or 0
-- debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", charcode)
if ]] .. class_name .. [[:test(prev_charcode) then return end
if ]] .. class_name .. [[:test(ctx:get_charcode()) then
if ]] .. class_name .. [[:test(charcode) then
ctx:next_function()
return ctx:get_function()(ctx)
end

View File

@ -9,7 +9,7 @@ utf8.config.modifier = utf8.config.modifier or {
function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx)
for _, m in ipairs(utf8.config.modifier) do
local functions, move = m.parse(regex, c, bs, ctx)
utf8.debug("mod", _, c, bs, nbs, move, functions and utf8.config.unpack(functions))
utf8.debug("mod", _, c, bs, move, functions and utf8.config.unpack(functions))
if functions then
ctx.prev_class = nil
return functions, move

View File

@ -7,7 +7,7 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- simple
debug(ctx, 'simple', ']] .. class_name .. [[')
-- debug(ctx, 'simple', ']] .. class_name .. [[')
if ]] .. class_name .. [[:test(ctx:get_charcode()) then
ctx:next_char()
ctx:next_function()

View File

@ -15,7 +15,7 @@ end
local function check(ctx)
if ctx.prev_class then
table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(bs)))
table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(ctx.pos)))
ctx.prev_class = nil
end
end

View File

@ -10,18 +10,21 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- star
debug(ctx, 'star', ']] .. class_name .. [[')
local saved = {ctx:clone()}
while ]] .. class_name .. [[:test(ctx:get_charcode()) do
ctx:next_char()
table.insert(saved, ctx:clone())
debug('#saved <<', #saved)
-- debug(ctx, 'star', ']] .. class_name .. [[')
local clone = ctx:clone()
while ]] .. class_name .. [[:test(clone:get_charcode()) do
clone:next_char()
end
while #saved > 0 do
ctx = table.remove(saved)
ctx:next_function()
ctx:get_function()(ctx)
debug('#saved >>', #saved)
local pos = clone.pos
while pos >= ctx.pos do
clone.pos = pos
clone.func_pos = ctx.func_pos
clone:next_function()
clone:get_function()(clone)
if clone.modified then
clone = ctx:clone()
end
pos = pos - 1
end
end)
]]
@ -32,15 +35,23 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- minus
debug(ctx, 'minus', ']] .. class_name .. [[')
-- debug(ctx, 'minus', ']] .. class_name .. [[')
local clone = ctx:clone()
local pos
repeat
local saved = ctx:clone()
ctx:next_function()
ctx:get_function()(ctx)
ctx = saved
local match = ]] .. class_name .. [[:test(ctx:get_charcode())
ctx:next_char()
pos = clone.pos
clone:next_function()
clone:get_function()(clone)
if clone.modified then
clone = ctx:clone()
clone.pos = pos
else
clone.pos = pos
clone.func_pos = ctx.func_pos
end
local match = ]] .. class_name .. [[:test(clone:get_charcode())
clone:next_char()
until not match
end)
]]
@ -51,7 +62,7 @@ local matchers = {
local ]] .. class_name .. [[ = ]] .. class .. [[
add(function(ctx) -- question
debug(ctx, 'question', ']] .. class_name .. [[')
-- debug(ctx, 'question', ']] .. class_name .. [[')
local saved = ctx:clone()
if ]] .. class_name .. [[:test(ctx:get_charcode()) then
ctx:next_char()
@ -67,8 +78,9 @@ local matchers = {
capture_start = function(number)
return [[
add(function(ctx)
debug(ctx, 'capture_start', ']] .. tostring(number) .. [[')
table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start_byte = byte_pos, start = ctx.pos })
ctx.modified = true
-- debug(ctx, 'capture_start', ']] .. tostring(number) .. [[')
table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start = ctx.pos })
ctx:next_function()
return ctx:get_function()(ctx)
end)
@ -77,12 +89,34 @@ local matchers = {
capture_finish = function(number)
return [[
add(function(ctx)
debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[')
ctx.modified = true
-- debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[')
local cap = table.remove(ctx.captures.active)
cap.finish_byte = byte_pos
cap.finish = ctx.pos
ctx.captures[cap.id] = utf8sub(ctx.str, cap.start, cap.finish - 1)
debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id])
local b, e = ctx.offsets[cap.start], ctx.offsets[cap.finish]
if cap.start < 1 then
b = 1
elseif cap.start >= ctx.len then
b = ctx.rawlen + 1
end
if cap.finish < 1 then
e = 1
elseif cap.finish >= ctx.len then
e = ctx.rawlen + 1
end
ctx.captures[cap.id] = rawsub(ctx.str, b, e - 1)
-- debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id])
ctx:next_function()
return ctx:get_function()(ctx)
end)
]]
end,
capture_position = function(number)
return [[
add(function(ctx)
ctx.modified = true
-- debug(ctx, 'capture_position', ']] .. tostring(number) .. [[')
ctx.captures[ ]] .. tostring(number) .. [[ ] = ctx.pos
ctx:next_function()
return ctx:get_function()(ctx)
end)
@ -91,11 +125,11 @@ local matchers = {
capture = function(number)
return [[
add(function(ctx)
debug(ctx, 'capture', ']] .. tostring(number) .. [[')
-- debug(ctx, 'capture', ']] .. tostring(number) .. [[')
local cap = ctx.captures[ ]] .. tostring(number) .. [[ ]
local len = utf8len(cap)
local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1)
debug("capture check:", cap, check)
-- debug("capture check:", cap, check)
if cap == check then
ctx.pos = ctx.pos + len
ctx:next_function()
@ -121,9 +155,9 @@ local matchers = {
elseif c == b then
balance = balance - 1
end
debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode())
-- debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode())
ctx:next_char()
until balance == 0
until balance == 0 or (balance == 2 and d == b)
ctx:next_function()
return ctx:get_function()(ctx)
end)
@ -139,6 +173,9 @@ local function parse(regex, c, bs, ctx)
if c == '%' then
c, nbs = next(regex, bs)
utf8.debug("next", c, bs)
if c == '' then
error("malformed pattern (ends with '%')")
end
if utf8.raw.find('123456789', c, 1, true) then
functions = { matchers.capture(tonumber(c)) }
nbs = utf8.next(regex, nbs)
@ -146,6 +183,7 @@ local function parse(regex, c, bs, ctx)
local d, b
d, nbs = next(regex, nbs)
b, nbs = next(regex, nbs)
assert(d ~= '' and b ~= '', "unbalanced pattern")
functions = { matchers.balancer({d, b}, tostring(bs)) }
nbs = utf8.next(regex, nbs)
end
@ -191,13 +229,19 @@ local function parse(regex, c, bs, ctx)
nbs = bs + 1
elseif c == '(' then
ctx.capture = ctx.capture or {balance = 0, id = 0}
ctx.capture.balance = ctx.capture.balance + 1
ctx.capture.id = ctx.capture.id + 1
functions = { matchers.capture_start(ctx.capture.id) }
local nc = next(regex, nbs)
if nc == ')' then
functions = {matchers.capture_position(ctx.capture.id)}
nbs = bs + 2
else
ctx.capture.balance = ctx.capture.balance + 1
functions = {matchers.capture_start(ctx.capture.id)}
nbs = bs + 1
end
if ctx.prev_class then
table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs)))
end
nbs = bs + 1
elseif c == ')' then
ctx.capture = ctx.capture or {balance = 0, id = 0}
functions = { matchers.capture_finish(ctx.capture.id) }

View File

@ -80,8 +80,14 @@ local function utf8symbollen(byte)
return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1
end
-- Lookup table from a byte value (0..255) to the length reported by
-- utf8symbollen for that byte, computed once up front.
local head_table = utf8.config.int32array(256)
for lead = 0, 255 do
  head_table[lead] = utf8symbollen(lead)
end
-- Extra slot 256 holds 0; utf8charbytes indexes it when there is no byte
-- at the requested position.
head_table[256] = 0
local function utf8charbytes(str, bs)
return utf8symbollen(byte(str, bs))
return head_table[byte(str, bs) or 256]
end
local function utf8next(str, bs)
@ -201,13 +207,12 @@ utf8unicode = function(str, ibs, jbs)
bytes = utf8charbytes(str, ibs)
if bytes == 0 then return end
ch = sub(str,ibs,ibs-1+bytes)
local unicode
if bytes == 1 then unicode = byte(ch) end
if bytes == 1 then unicode = byte(str, ibs, ibs) end
if bytes == 2 then
local byte0,byte1 = byte(ch,1,2)
local byte0,byte1 = byte(str, ibs, ibs + 1)
if byte0 and byte1 then
local code0,code1 = byte0-0xC0,byte1-0x80
unicode = code0*shift_6 + code1
@ -216,7 +221,7 @@ utf8unicode = function(str, ibs, jbs)
end
end
if bytes == 3 then
local byte0,byte1,byte2 = byte(ch,1,3)
local byte0,byte1,byte2 = byte(str, ibs, ibs + 2)
if byte0 and byte1 and byte2 then
local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
unicode = code0*shift_12 + code1*shift_6 + code2
@ -225,7 +230,7 @@ utf8unicode = function(str, ibs, jbs)
end
end
if bytes == 4 then
local byte0,byte1,byte2,byte3 = byte(ch,1,4)
local byte0,byte1,byte2,byte3 = byte(str, ibs, ibs + 3)
if byte0 and byte1 and byte2 and byte3 then
local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
@ -234,7 +239,11 @@ utf8unicode = function(str, ibs, jbs)
end
end
return unicode,utf8unicode(str, ibs+bytes, jbs)
if ibs == jbs then
return unicode
else
return unicode,utf8unicode(str, ibs+bytes, jbs)
end
end
local function utf8byte(str, i, j)
@ -281,7 +290,7 @@ local function utf8gensub(str, sub_len)
return function(skip_ptr, bs)
bs = (bs and bs or 1) + (skip_ptr and (skip_ptr[1] or 0) or 0)
nbs = bs
local nbs = bs
if bs > max_len then return nil end
for i = 1, sub_len do
nbs = utf8next(str, nbs)
@ -427,7 +436,7 @@ local function utf8offset(str, n, bs)
bs = 1
end
end
if bs < 0 or bs > l + 1 then
if bs <= 0 or bs > l + 1 then
error("bad argument #3 to 'offset' (position out of range)")
end
@ -437,8 +446,8 @@ local function utf8offset(str, n, bs)
end
while true do
local b = byte(str, bs)
if 0 < b and b < 127
or 194 < b and b < 244 then
if (0 < b and b < 127)
or (194 < b and b < 244) then
return bs
end
bs = bs - 1
@ -454,8 +463,8 @@ local function utf8offset(str, n, bs)
end
local b = byte(str, bs)
if 0 < b and b < 127
or 194 < b and b < 244 then
if (0 < b and b < 127)
or (194 < b and b < 244) then
n = n + 1
end
bs = bs - 1
@ -468,8 +477,8 @@ local function utf8offset(str, n, bs)
end
local b = byte(str, bs)
if 0 < b and b < 127
or 194 < b and b < 244 then
if (0 < b and b < 127)
or (194 < b and b < 244) then
n = n - 1
for i = 1, n do
if bs > l then

View File

@ -1,12 +1,19 @@
return function(utf8)
os.setlocale(utf8.config.locale, "ctype")
local ffi = require("ffi")
ffi.cdef[[
int towupper(int c);
int towlower(int c);
]]
local ffi = require("ffi")
-- Select the ctype locale and declare the C case-mapping functions for
-- the current platform. `utf8.config.locale`, when set, overrides the
-- per-platform default locale name.
if ffi.os == "Windows" then
  os.setlocale(utf8.config.locale or "english_us.65001", "ctype")
  -- The Microsoft C runtime declares these functions with wint_t, an
  -- unsigned 16-bit type. Declaring them with a signed `short` makes any
  -- result at or above 0x8000 come back as a negative number.
  ffi.cdef[[
    unsigned short towupper(unsigned short c);
    unsigned short towlower(unsigned short c);
  ]]
else
  os.setlocale(utf8.config.locale or "C.UTF-8", "ctype")
  ffi.cdef[[
    int towupper(int c);
    int towlower(int c);
  ]]
end
utf8:require "primitives.dummy"

View File

@ -58,7 +58,9 @@ return function(regex, plain)
local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1})
local cl = utf8:require("charclass.runtime.init")
local utf8sub = utf8.sub
local rawsub = utf8.raw.sub
local utf8len = utf8.len
local utf8next = utf8.next
local debug = utf8.debug
local function add(fun)
ctx.functions[#ctx.functions + 1] = fun