LoveFrames/libraries/utf8.lua

1066 lines
26 KiB
Lua
Raw Normal View History

-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
--
-- Provides UTF-8 aware string functions implemented in pure lua:
2014-11-20 15:42:59 +00:00
-- * utf8len(s)
-- * utf8sub(s, i, j)
-- * utf8reverse(s)
-- * utf8char(unicode)
-- * utf8unicode(s, i, j)
-- * utf8gensub(s, sub_len)
-- * utf8find(str, regex, init, plain)
-- * utf8match(str, regex, init)
-- * utf8gmatch(str, regex, all)
-- * utf8gsub(str, regex, repl, limit)
--
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
-- additional functions are available:
2014-11-20 15:42:59 +00:00
-- * utf8upper(s)
-- * utf8lower(s)
--
-- All functions behave as their non UTF-8 aware counterparts with the exception
-- that UTF-8 characters are used instead of bytes for all units.
--[[
Copyright (c) 2006-2007, Kyle Smith
All rights reserved.
2014-11-20 15:42:59 +00:00
Contributors:
Alimov Stepan
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the author nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--]]
-- ABNF from RFC 3629
--
-- UTF8-octets = *( UTF8-char )
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
-- UTF8-1 = %x00-7F
-- UTF8-2 = %xC2-DF UTF8-tail
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
-- %xF4 %x80-8F 2( UTF8-tail )
-- UTF8-tail = %x80-BF
--
2014-11-20 15:42:59 +00:00
local byte = string.byte
local char = string.char
local dump = string.dump
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local lower = string.lower
local match = string.match
local rep = string.rep
local reverse = string.reverse
local sub = string.sub
local upper = string.upper
-- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator
local function utf8charbytes (s, i)
-- argument defaults
i = i or 1
-- argument checking
if type(s) ~= "string" then
error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
end
if type(i) ~= "number" then
error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
end
2014-11-20 15:42:59 +00:00
local c = byte(s, i)
-- determine bytes needed for character, based on RFC 3629
-- validate byte 1
if c > 0 and c <= 127 then
-- UTF8-1
return 1
elseif c >= 194 and c <= 223 then
-- UTF8-2
2014-11-20 15:42:59 +00:00
local c2 = byte(s, i + 1)
if not c2 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
return 2
elseif c >= 224 and c <= 239 then
-- UTF8-3
2014-11-20 15:42:59 +00:00
local c2 = byte(s, i + 1)
local c3 = byte(s, i + 2)
if not c2 or not c3 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c == 224 and (c2 < 160 or c2 > 191) then
error("Invalid UTF-8 character")
elseif c == 237 and (c2 < 128 or c2 > 159) then
error("Invalid UTF-8 character")
elseif c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 3
if c3 < 128 or c3 > 191 then
error("Invalid UTF-8 character")
end
return 3
elseif c >= 240 and c <= 244 then
-- UTF8-4
2014-11-20 15:42:59 +00:00
local c2 = byte(s, i + 1)
local c3 = byte(s, i + 2)
local c4 = byte(s, i + 3)
if not c2 or not c3 or not c4 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c == 240 and (c2 < 144 or c2 > 191) then
error("Invalid UTF-8 character")
elseif c == 244 and (c2 < 128 or c2 > 143) then
error("Invalid UTF-8 character")
elseif c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 3
if c3 < 128 or c3 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 4
if c4 < 128 or c4 > 191 then
error("Invalid UTF-8 character")
end
return 4
else
error("Invalid UTF-8 character")
end
end
-- returns the number of characters in a UTF-8 string
local function utf8len (s)
-- argument checking
if type(s) ~= "string" then
for k,v in pairs(s) do print('"',tostring(k),'"',tostring(v),'"') end
error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
end
local pos = 1
local bytes = len(s)
local len = 0
while pos <= bytes do
len = len + 1
pos = pos + utf8charbytes(s, pos)
end
return len
end
-- functions identically to string.sub except that i and j are UTF-8 characters
-- instead of bytes
local function utf8sub (s, i, j)
-- argument defaults
j = j or -1
local pos = 1
local bytes = len(s)
local len = 0
-- only set l if i or j is negative
local l = (i >= 0 and j >= 0) or utf8len(s)
local startChar = (i >= 0) and i or l + i + 1
local endChar = (j >= 0) and j or l + j + 1
-- can't have start before end!
if startChar > endChar then
return ""
end
-- byte offsets to pass to string.sub
local startByte,endByte = 1,bytes
while pos <= bytes do
len = len + 1
if len == startChar then
startByte = pos
end
pos = pos + utf8charbytes(s, pos)
if len == endChar then
endByte = pos - 1
break
end
end
if startChar > len then startByte = bytes+1 end
if endChar < 1 then endByte = 0 end
return sub(s, startByte, endByte)
end
-- replace UTF-8 characters based on a mapping table
local function utf8replace (s, mapping)
-- argument checking
if type(s) ~= "string" then
error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
end
if type(mapping) ~= "table" then
error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
end
local pos = 1
local bytes = len(s)
local charbytes
local newstr = ""
while pos <= bytes do
charbytes = utf8charbytes(s, pos)
local c = sub(s, pos, pos + charbytes - 1)
newstr = newstr .. (mapping[c] or c)
pos = pos + charbytes
end
return newstr
end
-- identical to string.upper except it knows about unicode simple case conversions
local function utf8upper (s)
return utf8replace(s, utf8_lc_uc)
end
-- identical to string.lower except it knows about unicode simple case conversions
local function utf8lower (s)
return utf8replace(s, utf8_uc_lc)
end
-- identical to string.reverse except that it supports UTF-8
local function utf8reverse (s)
-- argument checking
if type(s) ~= "string" then
error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
end
local bytes = len(s)
local pos = bytes
local charbytes
local newstr = ""
while pos > 0 do
2014-11-20 15:42:59 +00:00
c = byte(s, pos)
while c >= 128 and c <= 191 do
pos = pos - 1
2014-11-20 15:42:59 +00:00
c = byte(s, pos)
end
charbytes = utf8charbytes(s, pos)
newstr = newstr .. sub(s, pos, pos + charbytes - 1)
pos = pos - 1
end
return newstr
end
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
local function utf8char(unicode)
if unicode <= 0x7F then return char(unicode) end
if (unicode <= 0x7FF) then
local Byte0 = 0xC0 + math.floor(unicode / 0x40);
local Byte1 = 0x80 + (unicode % 0x40);
return char(Byte0, Byte1);
end;
if (unicode <= 0xFFFF) then
local Byte0 = 0xE0 + math.floor(unicode / 0x1000);
local Byte1 = 0x80 + (math.floor(unicode / 0x40) % 0x40);
local Byte2 = 0x80 + (unicode % 0x40);
return char(Byte0, Byte1, Byte2);
end;
if (unicode <= 0x10FFFF) then
local code = unicode
local Byte3= 0x80 + (code % 0x40);
code = math.floor(code / 0x40)
local Byte2= 0x80 + (code % 0x40);
code = math.floor(code / 0x40)
local Byte1= 0x80 + (code % 0x40);
code = math.floor(code / 0x40)
local Byte0= 0xF0 + code;
return char(Byte0, Byte1, Byte2, Byte3);
end;
error 'Unicode cannot be greater than U+10FFFF!'
end
local shift_6 = 2^6
local shift_12 = 2^12
local shift_18 = 2^18
local utf8unicode
utf8unicode = function(str, i, j, byte_pos)
i = i or 1
j = j or i
if i > j then return end
local char,bytes
if byte_pos then
bytes = utf8charbytes(str,byte_pos)
char = sub(str,byte_pos,byte_pos-1+bytes)
else
2014-11-20 15:42:59 +00:00
char,byte_pos = utf8sub(str,i,i), 0
bytes = #char
end
local unicode
2014-11-20 15:42:59 +00:00
if bytes == 1 then unicode = byte(char) end
if bytes == 2 then
2014-11-20 15:42:59 +00:00
local byte0,byte1 = byte(char,1,2)
local code0,code1 = byte0-0xC0,byte1-0x80
unicode = code0*shift_6 + code1
end
if bytes == 3 then
2014-11-20 15:42:59 +00:00
local byte0,byte1,byte2 = byte(char,1,3)
local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
unicode = code0*shift_12 + code1*shift_6 + code2
end
if bytes == 4 then
2014-11-20 15:42:59 +00:00
local byte0,byte1,byte2,byte3 = byte(char,1,4)
local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
end
return unicode,utf8unicode(str, i+1, j, byte_pos+bytes)
end
-- Returns an iterator which returns the next substring and its byte interval
local function utf8gensub(str, sub_len)
sub_len = sub_len or 1
local byte_pos = 1
local len = #str
2014-11-20 15:42:59 +00:00
return function(skip)
if skip then byte_pos = byte_pos + skip end
local char_count = 0
local start = byte_pos
repeat
if byte_pos > len then return end
char_count = char_count + 1
local bytes = utf8charbytes(str,byte_pos)
byte_pos = byte_pos+bytes
until char_count == sub_len
local last = byte_pos-1
local sub = sub(str,start,last)
return sub, start, last
end
end
2014-11-20 15:42:59 +00:00
local function binsearch(sortedTable, item, comp)
local head, tail = 1, #sortedTable
local mid = math.floor((head + tail)/2)
if not comp then
while (tail - head) > 1 do
if sortedTable[tonumber(mid)] > item then
tail = mid
else
head = mid
end
mid = math.floor((head + tail)/2)
end
else
end
if sortedTable[tonumber(head)] == item then
return true, tonumber(head)
elseif sortedTable[tonumber(tail)] == item then
return true, tonumber(tail)
else
return false
end
end
local function classMatchGenerator(class, plain)
local codes = {}
local ranges = {}
local ignore = false
local range = false
local firstletter = true
local unmatch = false
local it = utf8gensub(class)
local skip
for c,bs,be in it do
skip = be
if not ignore and not plain then
if c == "%" then
ignore = true
elseif c == "-" then
table.insert(codes, utf8unicode(c))
range = true
elseif c == "^" then
if not firstletter then
error('!!!')
else
unmatch = true
end
elseif c == ']' then
break
else
if not range then
table.insert(codes, utf8unicode(c))
else
table.remove(codes) -- removing '-'
table.insert(ranges, {table.remove(codes), utf8unicode(c)})
range = false
end
end
elseif ignore and not plain then
if c == 'a' then -- %a: represents all letters. (ONLY ASCII)
table.insert(ranges, {65, 90}) -- A - Z
table.insert(ranges, {97, 122}) -- a - z
elseif c == 'c' then -- %c: represents all control characters.
table.insert(ranges, {0, 31})
table.insert(codes, 127)
elseif c == 'd' then -- %d: represents all digits.
table.insert(ranges, {48, 57}) -- 0 - 9
elseif c == 'g' then -- %g: represents all printable characters except space.
table.insert(ranges, {1, 8})
table.insert(ranges, {14, 31})
table.insert(ranges, {33, 132})
table.insert(ranges, {134, 159})
table.insert(ranges, {161, 5759})
table.insert(ranges, {5761, 8191})
table.insert(ranges, {8203, 8231})
table.insert(ranges, {8234, 8238})
table.insert(ranges, {8240, 8286})
table.insert(ranges, {8288, 12287})
elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII)
table.insert(ranges, {97, 122}) -- a - z
elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII)
table.insert(ranges, {33, 47})
table.insert(ranges, {58, 64})
table.insert(ranges, {91, 96})
table.insert(ranges, {123, 126})
elseif c == 's' then -- %s: represents all space characters.
table.insert(ranges, {9, 13})
table.insert(codes, 32)
table.insert(codes, 133)
table.insert(codes, 160)
table.insert(codes, 5760)
table.insert(ranges, {8192, 8202})
table.insert(codes, 8232)
table.insert(codes, 8233)
table.insert(codes, 8239)
table.insert(codes, 8287)
table.insert(codes, 12288)
elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII)
table.insert(ranges, {65, 90}) -- A - Z
elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII)
table.insert(ranges, {48, 57}) -- 0 - 9
table.insert(ranges, {65, 90}) -- A - Z
table.insert(ranges, {97, 122}) -- a - z
elseif c == 'x' then -- %x: represents all hexadecimal digits.
table.insert(ranges, {48, 57}) -- 0 - 9
table.insert(ranges, {65, 70}) -- A - F
table.insert(ranges, {97, 102}) -- a - f
else
if not range then
table.insert(codes, utf8unicode(c))
else
table.remove(codes) -- removing '-'
table.insert(ranges, {table.remove(codes), utf8unicode(c)})
range = false
end
end
ignore = false
else
if not range then
table.insert(codes, utf8unicode(c))
else
table.remove(codes) -- removing '-'
table.insert(ranges, {table.remove(codes), utf8unicode(c)})
range = false
end
ignore = false
end
firstletter = false
end
table.sort(codes)
local function inRanges(charCode)
for _,r in ipairs(ranges) do
if r[1] <= charCode and charCode <= r[2] then
return true
end
end
return false
end
if not unmatch then
return function(charCode)
return binsearch(codes, charCode) or inRanges(charCode)
end, skip
else
return function(charCode)
return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
end, skip
end
end
-- utf8sub with extra argument, and extra result value
local function utf8subWithBytes (s, i, j, sb)
-- argument defaults
j = j or -1
local pos = sb or 1
local bytes = len(s)
local len = 0
-- only set l if i or j is negative
local l = (i >= 0 and j >= 0) or utf8len(s)
local startChar = (i >= 0) and i or l + i + 1
local endChar = (j >= 0) and j or l + j + 1
-- can't have start before end!
if startChar > endChar then
return ""
end
-- byte offsets to pass to string.sub
local startByte,endByte = 1,bytes
while pos <= bytes do
len = len + 1
if len == startChar then
startByte = pos
end
pos = pos + utf8charbytes(s, pos)
if len == endChar then
endByte = pos - 1
break
end
end
if startChar > len then startByte = bytes+1 end
if endChar < 1 then endByte = 0 end
return sub(s, startByte, endByte), endByte + 1
end
local cache = setmetatable({},{
__mode = 'kv'
})
local cachePlain = setmetatable({},{
__mode = 'kv'
})
local function matcherGenerator(regex, plain)
local matcher = {
functions = {},
captures = {}
}
if not plain then
cache[regex] = matcher
else
cachePlain[regex] = matcher
end
local function simple(func)
return function(cC)
if func(cC) then
matcher:nextFunc()
matcher:nextStr()
else
matcher:reset()
end
end
end
local function star(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextFunc()
matcher:nextStr()
else
matcher:nextFunc()
end
end
end
local function minus(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextStr()
end
matcher:nextFunc()
end
end
local function question(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextFunc()
matcher:nextStr()
end
matcher:nextFunc()
end
end
local function capture(id)
return function(cC)
local l = matcher.captures[id][2] - matcher.captures[id][1]
local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
if captured == check then
for i = 0, l do
matcher:nextStr()
end
matcher:nextFunc()
else
matcher:reset()
end
end
end
local function captureStart(id)
return function(cC)
matcher.captures[id][1] = matcher.str
matcher:nextFunc()
end
end
local function captureStop(id)
return function(cC)
matcher.captures[id][2] = matcher.str - 1
matcher:nextFunc()
end
end
local function balancer(str)
local sum = 0
local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
local skip = len(bc) + len(ec)
bc, ec = utf8unicode(bc), utf8unicode(ec)
return function(cC)
if cC == ec and sum > 0 then
sum = sum - 1
if sum == 0 then
matcher:nextFunc()
end
matcher:nextStr()
elseif cC == bc then
sum = sum + 1
matcher:nextStr()
else
if sum == 0 or cC == -1 then
sum = 0
matcher:reset()
else
matcher:nextStr()
end
end
end, skip
end
matcher.functions[1] = function(cC)
matcher:fullResetOnNextStr()
matcher.seqStart = matcher.str
matcher:nextFunc()
if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
matcher.stop = true
matcher.seqStart = nil
end
end
local lastFunc
local ignore = false
local skip = nil
local it = (function()
local gen = utf8gensub(regex)
return function()
return gen(skip)
end
end)()
local cs = {}
for c, bs, be in it do
skip = nil
if plain then
table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
else
if ignore then
if find('123456789', c, 1, true) then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
table.insert(matcher.functions, capture(tonumber(c)))
elseif c == 'b' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
local b
b, skip = balancer(sub(regex, be + 1, be + 9))
table.insert(matcher.functions, b)
else
lastFunc = classMatchGenerator('%' .. c)
end
ignore = false
else
if c == '*' then
if lastFunc then
table.insert(matcher.functions, star(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '+' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
table.insert(matcher.functions, star(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '-' then
if lastFunc then
table.insert(matcher.functions, minus(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '?' then
if lastFunc then
table.insert(matcher.functions, question(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '^' then
if bs == 1 then
matcher.fromStart = true
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '$' then
if be == len(regex) then
matcher.toEnd = true
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '[' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
elseif c == '(' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
table.insert(matcher.captures, {})
table.insert(cs, #matcher.captures)
table.insert(matcher.functions, captureStart(cs[#cs]))
if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
elseif c == ')' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
local cap = table.remove(cs)
if not cap then
error('invalid capture: "(" missing')
end
table.insert(matcher.functions, captureStop(cap))
elseif c == '.' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc = function(cC) return cC ~= -1 end
elseif c == '%' then
ignore = true
else
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc = classMatchGenerator(c)
end
end
end
end
if #cs > 0 then
error('invalid capture: ")" missing')
end
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc = nil
ignore = nil
table.insert(matcher.functions, function()
if matcher.toEnd and matcher.str ~= matcher.stringLen then
matcher:reset()
else
matcher.stop = true
end
end)
matcher.nextFunc = function(self)
self.func = self.func + 1
end
matcher.nextStr = function(self)
self.str = self.str + 1
end
matcher.strReset = function(self)
local oldReset = self.reset
local str = self.str
self.reset = function(s)
s.str = str
s.reset = oldReset
end
end
matcher.fullResetOnNextFunc = function(self)
local oldReset = self.reset
local func = self.func +1
local str = self.str
self.reset = function(s)
s.func = func
s.str = str
s.reset = oldReset
end
end
matcher.fullResetOnNextStr = function(self)
local oldReset = self.reset
local str = self.str + 1
local func = self.func
self.reset = function(s)
s.func = func
s.str = str
s.reset = oldReset
end
end
matcher.process = function(self, str, start)
self.func = 1
start = start or 1
self.startStr = (start >= 0) and start or utf8len(str) + start + 1
self.seqStart = self.startStr
self.str = self.startStr
self.stringLen = utf8len(str) + 1
self.string = str
self.stop = false
self.reset = function(s)
s.func = 1
end
local lastPos = self.str
local lastByte
local char
while not self.stop do
if self.str < self.stringLen then
--[[ if lastPos < self.str then
print('last byte', lastByte)
char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
lastByte = lastByte - 1
else
char, lastByte = utf8subWithBytes(str, self.str, self.str)
end
lastPos = self.str ]]
char = utf8sub(str, self.str,self.str)
--print('char', char, utf8unicode(char))
self.functions[self.func](utf8unicode(char))
else
self.functions[self.func](-1)
end
end
if self.seqStart then
local captures = {}
for _,pair in pairs(self.captures) do
if pair.empty then
table.insert(captures, pair[1])
else
table.insert(captures, utf8sub(str, pair[1], pair[2]))
end
end
return self.seqStart, self.str - 1, unpack(captures)
end
end
return matcher
end
-- string.find
local function utf8find(str, regex, init, plain)
local matcher = cache[regex] or matcherGenerator(regex, plain)
return matcher:process(str, init)
end
-- string.match
local function utf8match(str, regex, init)
init = init or 1
local found = {utf8find(str, regex, init)}
if found[1] then
if found[3] then
return unpack(found, 3)
end
return utf8sub(str, found[1], found[2])
end
end
-- string.gmatch
local function utf8gmatch(str, regex, all)
regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
local lastChar = 1
return function()
local found = {utf8find(str, regex, lastChar)}
if found[1] then
lastChar = found[2] + 1
if found[all and 1 or 3] then
return unpack(found, all and 1 or 3)
end
return utf8sub(str, found[1], found[2])
end
end
end
local function replace(repl, args)
local ret = ''
if type(repl) == 'string' then
local ignore = false
local num = 0
for c in utf8gensub(repl) do
if not ignore then
if c == '%' then
ignore = true
else
ret = ret .. c
end
else
num = tonumber(c)
if num then
ret = ret .. args[num]
else
ret = ret .. c
end
ignore = false
end
end
elseif type(repl) == 'table' then
ret = repl[args[1] or args[0]] or ''
elseif type(repl) == 'function' then
if #args > 0 then
ret = repl(unpack(args, 1)) or ''
else
ret = repl(args[0]) or ''
end
end
return ret
end
-- string.gsub
local function utf8gsub(str, regex, repl, limit)
limit = limit or -1
local ret = ''
local prevEnd = 1
local it = utf8gmatch(str, regex, true)
local found = {it()}
local n = 0
while #found > 0 and limit ~= n do
local args = {[0] = utf8sub(str, found[1], found[2]), unpack(found, 3)}
ret = ret .. utf8sub(str, prevEnd, found[1] - 1)
.. replace(repl, args)
prevEnd = found[2] + 1
n = n + 1
found = {it()}
end
return ret .. utf8sub(str, prevEnd), n
end
string.byte = utf8unicode
string.char = utf8char
string.dump = dump
string.find = utf8find
string.format = format
string.gmatch = utf8gmatch
string.gsub = utf8gsub
string.len = utf8len
string.lower = lower--utf8lower
string.match = utf8match
string.rep = rep
string.reverse = utf8reverse
string.sub = utf8sub
string.upper = upper--utf8upper
string.gensub = utf8gensub
--[[ local utf8 = {}
utf8.len = utf8len
utf8.sub = utf8sub
utf8.reverse = utf8reverse
utf8.char = utf8char
utf8.unicode = utf8unicode
utf8.gensub = utf8gensub
utf8.byte = utf8unicode
utf8.find = utf8find
utf8.match = utf8match
utf8.gmatch = utf8gmatch
utf8.gsub = utf8gsub
utf8.dump = dump
utf8.format = format
utf8.lower = lower
utf8.upper = upper
utf8.rep = rep
return utf8 ]]