utf8 fullsupport

This commit is contained in:
Stepets 2014-11-20 18:42:59 +03:00
parent 34a84f6823
commit 2d8fbdecf0

View File

@ -1,17 +1,21 @@
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
--
-- Provides UTF-8 aware string functions implemented in pure lua:
-- * string.utf8len(s)
-- * string.utf8sub(s, i, j)
-- * string.utf8reverse(s)
-- * string.utf8char(unicode)
-- * string.utf8unicode(s, i, j)
-- * string.utf8gensub(s, sub_len)
-- * utf8len(s)
-- * utf8sub(s, i, j)
-- * utf8reverse(s)
-- * utf8char(unicode)
-- * utf8unicode(s, i, j)
-- * utf8gensub(s, sub_len)
-- * utf8find(str, regex, init, plain)
-- * utf8match(str, regex, init)
-- * utf8gmatch(str, regex, all)
-- * utf8gsub(str, regex, repl, limit)
--
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
-- additional functions are available:
-- * string.utf8upper(s)
-- * string.utf8lower(s)
-- * utf8upper(s)
-- * utf8lower(s)
--
-- All functions behave as their non UTF-8 aware counterparts with the exception
-- that UTF-8 characters are used instead of bytes for all units.
@ -20,6 +24,9 @@
Copyright (c) 2006-2007, Kyle Smith
All rights reserved.
Contributors:
Alimov Stepan
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
@ -57,9 +64,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-- UTF8-tail = %x80-BF
--
local len = string.len
local sub = string.sub
local char = string.char
local byte = string.byte
local char = string.char
local dump = string.dump
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local lower = string.lower
local match = string.match
local rep = string.rep
local reverse = string.reverse
local sub = string.sub
local upper = string.upper
-- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator
@ -75,7 +93,7 @@ local function utf8charbytes (s, i)
error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
end
local c = s:byte(i)
local c = byte(s, i)
-- determine bytes needed for character, based on RFC 3629
-- validate byte 1
@ -85,7 +103,7 @@ local function utf8charbytes (s, i)
elseif c >= 194 and c <= 223 then
-- UTF8-2
local c2 = s:byte(i + 1)
local c2 = byte(s, i + 1)
if not c2 then
error("UTF-8 string terminated early")
@ -100,8 +118,8 @@ local function utf8charbytes (s, i)
elseif c >= 224 and c <= 239 then
-- UTF8-3
local c2 = s:byte(i + 1)
local c3 = s:byte(i + 2)
local c2 = byte(s, i + 1)
local c3 = byte(s, i + 2)
if not c2 or not c3 then
error("UTF-8 string terminated early")
@ -125,9 +143,9 @@ local function utf8charbytes (s, i)
elseif c >= 240 and c <= 244 then
-- UTF8-4
local c2 = s:byte(i + 1)
local c3 = s:byte(i + 2)
local c4 = s:byte(i + 3)
local c2 = byte(s, i + 1)
local c3 = byte(s, i + 2)
local c4 = byte(s, i + 3)
if not c2 or not c3 or not c4 then
error("UTF-8 string terminated early")
@ -275,10 +293,10 @@ local function utf8reverse (s)
local newstr = ""
while pos > 0 do
c = s:byte(pos)
c = byte(s, pos)
while c >= 128 and c <= 191 do
pos = pos - 1
c = s:byte(pos)
c = byte(s, pos)
end
charbytes = utf8charbytes(s, pos)
@ -342,25 +360,25 @@ utf8unicode = function(str, i, j, byte_pos)
bytes = utf8charbytes(str,byte_pos)
char = sub(str,byte_pos,byte_pos-1+bytes)
else
char,byte_pos = utf8sub(str,i,i)
char,byte_pos = utf8sub(str,i,i), 0
bytes = #char
end
local unicode
if bytes == 1 then unicode = string.byte(char) end
if bytes == 1 then unicode = byte(char) end
if bytes == 2 then
local byte0,byte1 = string.byte(char,1,2)
local byte0,byte1 = byte(char,1,2)
local code0,code1 = byte0-0xC0,byte1-0x80
unicode = code0*shift_6 + code1
end
if bytes == 3 then
local byte0,byte1,byte2 = string.byte(char,1,3)
local byte0,byte1,byte2 = byte(char,1,3)
local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
unicode = code0*shift_12 + code1*shift_6 + code2
end
if bytes == 4 then
local byte0,byte1,byte2,byte3 = string.byte(char,1,4)
local byte0,byte1,byte2,byte3 = byte(char,1,4)
local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
end
@ -373,7 +391,8 @@ local function utf8gensub(str, sub_len)
sub_len = sub_len or 1
local byte_pos = 1
local len = #str
return function()
return function(skip)
if skip then byte_pos = byte_pos + skip end
local char_count = 0
local start = byte_pos
repeat
@ -390,9 +409,657 @@ local function utf8gensub(str, sub_len)
end
end
-- Install the UTF-8 aware implementations on the shared `string` table so
-- that both `string.f(s)` and `s:f()` call them.
string.gensub  = utf8gensub
string.unicode = utf8unicode
string.char    = utf8char
string.reverse = utf8reverse
string.sub     = utf8sub
string.len     = utf8len
-- Tells whether two values are equivalent, either by `==` or, when a
-- comparator is supplied, by neither one ordering before the other.
local function binsearchEquivalent(a, b, comp)
  if comp then
    return not comp(a, b) and not comp(b, a)
  end
  return a == b
end

-- Binary search for `item` in the sorted array `sortedTable`.
--
-- sortedTable: array sorted ascending (or sorted according to `comp`)
-- item:        value to look for
-- comp:        optional strict "less than" function, same contract as the
--              comparator given to table.sort; defaults to the `<` operator
--
-- Returns `true, index` when the item is present and `false` otherwise.
local function binsearch(sortedTable, item, comp)
  local head, tail = 1, #sortedTable
  if tail == 0 then
    -- nothing to search; also avoids handing nil to the comparator
    return false
  end

  -- Narrow [head, tail] until the two bounds are adjacent (or equal).
  while (tail - head) > 1 do
    local mid = math.floor((head + tail) / 2)
    local probe = sortedTable[mid]
    local goLeft
    if comp then
      goLeft = comp(item, probe)
    else
      goLeft = item < probe
    end
    if goLeft then
      tail = mid
    else
      head = mid
    end
  end

  -- The item, if present, now sits on one of the two remaining bounds.
  if binsearchEquivalent(sortedTable[head], item, comp) then
    return true, head
  elseif binsearchEquivalent(sortedTable[tail], item, comp) then
    return true, tail
  end
  return false
end
-- Code point ranges / single code points contributed by each lower-case
-- %-class letter. Letter classes are ASCII only; %s and %g also list some
-- non-ASCII code points. The tables are shared and must be treated as
-- read-only.
local classDefinitions = {
  -- %a: letters (ASCII only)
  a = { ranges = { {65, 90}, {97, 122} } },
  -- %c: control characters
  c = { ranges = { {0, 31} }, codes = { 127 } },
  -- %d: digits
  d = { ranges = { {48, 57} } },
  -- %g: printable characters except space
  g = { ranges = {
    {1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191},
    {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287},
  } },
  -- %l: lower-case letters (ASCII only)
  l = { ranges = { {97, 122} } },
  -- %p: punctuation (ASCII only)
  p = { ranges = { {33, 47}, {58, 64}, {91, 96}, {123, 126} } },
  -- %s: space characters
  s = {
    ranges = { {9, 13}, {8192, 8202} },
    codes = { 32, 133, 160, 5760, 8232, 8233, 8239, 8287, 12288 },
  },
  -- %u: upper-case letters (ASCII only)
  u = { ranges = { {65, 90} } },
  -- %w: alphanumeric characters (ASCII only)
  w = { ranges = { {48, 57}, {65, 90}, {97, 122} } },
  -- %x: hexadecimal digits
  x = { ranges = { {48, 57}, {65, 70}, {97, 102} } },
}

-- Builds a predicate for one character class of a pattern.
--
-- class: the class text. Either a single (possibly %-escaped) character, or
--        the remainder of a pattern following '[' -- parsing stops at the
--        first unescaped ']'.
-- plain: when truthy every character of `class` is taken literally.
--
-- Returns two values:
--   1. function(charCode) -> truthy when the code point belongs to the class
--      (for a '^'-negated set: when it does not, and charCode is not the -1
--      end-of-input marker);
--   2. the third value produced by the utf8gensub iterator for the last
--      character consumed, which the pattern compiler feeds back to its own
--      iterator to skip over the set.
--
-- Upper-case class letters (%A, %D, ...) are not special-cased here; after
-- '%' they are handled like any other escaped literal.
local function classMatchGenerator(class, plain)
  local codes = {}
  local ranges = {}
  local ignore = false      -- previous character was an unconsumed '%'
  local range = false       -- previous character was a range '-'
  local firstletter = true
  local unmatch = false     -- set starts with '^'

  -- Adds a literal code point, or closes the range opened by a preceding '-'.
  local function addLiteral(code)
    if not range then
      table.insert(codes, code)
      return
    end
    range = false
    table.remove(codes) -- removing '-'
    local first = table.remove(codes)
    if first then
      table.insert(ranges, {first, code})
    else
      -- The '-' had no literal before it (e.g. "[-a]"), so there is no
      -- range to build: keep both characters as literals instead of
      -- storing a range with a nil start that would fail when compared.
      table.insert(codes, utf8unicode('-'))
      table.insert(codes, code)
    end
  end

  local skip
  for c, _, be in utf8gensub(class) do
    skip = be
    if plain then
      addLiteral(utf8unicode(c))
      ignore = false
    elseif ignore then
      local def = classDefinitions[c]
      if def then
        for _, r in ipairs(def.ranges) do
          table.insert(ranges, r)
        end
        if def.codes then
          for _, code in ipairs(def.codes) do
            table.insert(codes, code)
          end
        end
      else
        -- escaped literal such as %] or %-
        addLiteral(utf8unicode(c))
      end
      ignore = false
    elseif c == "%" then
      ignore = true
    elseif c == "-" then
      -- keep '-' as a literal for now; addLiteral turns it into a range if
      -- another literal follows
      table.insert(codes, utf8unicode(c))
      range = true
    elseif c == "^" and firstletter then
      unmatch = true
    elseif c == ']' then
      break
    else
      -- ordinary character; a '^' that is not the first character of the
      -- set is literal as well
      addLiteral(utf8unicode(c))
    end
    firstletter = false
  end

  -- binsearch needs the single code points in ascending order
  table.sort(codes)

  local function inRanges(charCode)
    for _, r in ipairs(ranges) do
      if r[1] <= charCode and charCode <= r[2] then
        return true
      end
    end
    return false
  end

  if not unmatch then
    return function(charCode)
      return binsearch(codes, charCode) or inRanges(charCode)
    end, skip
  else
    return function(charCode)
      return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
    end, skip
  end
end
-- Variant of utf8sub that can start scanning at a given byte offset and
-- also reports where it stopped.
--
-- s:  subject string
-- i:  first character index (negative values count from the end)
-- j:  last character index (defaults to -1, i.e. the last character)
-- sb: byte offset at which scanning starts (defaults to 1)
--
-- Returns the substring and the byte offset just past it. When the start
-- index lies after the end index only "" is returned.
local function utf8subWithBytes (s, i, j, sb)
  if not j then
    j = -1
  end
  local cursor = sb or 1
  local byteCount = len(s)
  local seen = 0

  -- the character count is only needed to resolve negative indices
  local total = (i >= 0 and j >= 0) or utf8len(s)
  local firstChar = i
  if i < 0 then
    firstChar = total + i + 1
  end
  local lastChar = j
  if j < 0 then
    lastChar = total + j + 1
  end

  -- can't have start before end!
  if firstChar > lastChar then
    return ""
  end

  -- byte offsets handed to the byte-oriented sub
  local firstByte, lastByte = 1, byteCount

  while cursor <= byteCount do
    seen = seen + 1
    if seen == firstChar then
      firstByte = cursor
    end
    cursor = cursor + utf8charbytes(s, cursor)
    if seen == lastChar then
      lastByte = cursor - 1
      break
    end
  end

  if firstChar > seen then
    firstByte = byteCount + 1
  end
  if lastChar < 1 then
    lastByte = 0
  end

  return sub(s, firstByte, lastByte), lastByte + 1
end
-- Compiled matchers keyed by their pattern text. Both tables are weak in
-- keys and values ('kv'), so an entry does not by itself keep a matcher
-- alive.
-- `cache` receives matchers compiled as patterns, `cachePlain` those
-- compiled in plain (literal) mode.
local cache = setmetatable({}, { __mode = 'kv' })
local cachePlain = setmetatable({}, { __mode = 'kv' })
-- Compiles `regex` into a matcher object and registers it in the matcher
-- cache (`cachePlain` when `plain` is truthy, `cache` otherwise).
--
-- The matcher holds a list of step functions (`matcher.functions`). Each
-- step receives the code point at the current subject position (or -1 once
-- the position is past the end) and moves the matcher by calling
--   matcher:nextFunc()  -- advance to the next step
--   matcher:nextStr()   -- advance to the next subject character
--   matcher:reset()     -- fall back to the most recently saved state
-- `matcher:process(str, start)` drives the steps and returns the match.
--
-- regex: pattern text
-- plain: when truthy every character of `regex` is matched literally
--
-- Raises an error for a quantifier without a preceding item, a misplaced
-- '^' or '$', and unbalanced capture parentheses.
local function matcherGenerator(regex, plain)
  local matcher = {
    functions = {},
    captures = {}
  }

  -- exactly one occurrence of the class
  local function simple(func)
    return function(cC)
      if func(cC) then
        matcher:nextFunc()
        matcher:nextStr()
      else
        matcher:reset()
      end
    end
  end

  -- '*': consume while the class matches, saving a fallback that resumes at
  -- the following step from the current position
  local function star(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextFunc()
        matcher:nextStr()
      else
        matcher:nextFunc()
      end
    end
  end

  -- '-': move on immediately; when the class matches, save a fallback that
  -- re-enters this step one character further
  local function minus(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextStr()
      end
      matcher:nextFunc()
    end
  end

  -- '?': consume one character when the class matches, saving a fallback
  -- that skips it
  local function question(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextFunc()
        matcher:nextStr()
      end
      matcher:nextFunc()
    end
  end

  -- %1..%9: the subject must repeat the text of capture `id` here
  local function capture(id)
    return function(cC)
      local l = matcher.captures[id][2] - matcher.captures[id][1]
      local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
      local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
      if captured == check then
        for i = 0, l do
          matcher:nextStr()
        end
        matcher:nextFunc()
      else
        matcher:reset()
      end
    end
  end

  -- '(': remember where capture `id` begins
  local function captureStart(id)
    return function(cC)
      matcher.captures[id][1] = matcher.str
      matcher:nextFunc()
    end
  end

  -- ')': remember where capture `id` ends
  local function captureStop(id)
    return function(cC)
      matcher.captures[id][2] = matcher.str - 1
      matcher:nextFunc()
    end
  end

  -- %bxy: `str` starts with the opening and closing characters. Returns the
  -- step function and the combined byte length of the two characters.
  local function balancer(str)
    local sum = 0
    local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
    local skip = len(bc) + len(ec)
    bc, ec = utf8unicode(bc), utf8unicode(ec)
    return function(cC)
      if cC == ec and sum > 0 then
        sum = sum - 1
        if sum == 0 then
          matcher:nextFunc()
        end
        matcher:nextStr()
      elseif cC == bc then
        sum = sum + 1
        matcher:nextStr()
      else
        if sum == 0 or cC == -1 then
          sum = 0
          matcher:reset()
        else
          matcher:nextStr()
        end
      end
    end, skip
  end

  -- Step 1 starts a match attempt at the current position and saves a
  -- fallback that retries from the next character. It gives up (no match)
  -- when the pattern is anchored and the position has moved past the start,
  -- or when the position has reached the end of the subject.
  matcher.functions[1] = function(cC)
    matcher:fullResetOnNextStr()
    matcher.seqStart = matcher.str
    matcher:nextFunc()
    if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
      matcher.stop = true
      matcher.seqStart = nil
    end
  end

  local lastFunc        -- class predicate still waiting for a quantifier
  local ignore = false  -- previous pattern character was '%'
  local skip = nil      -- bytes the pattern iterator must jump over
  local it = (function()
    local gen = utf8gensub(regex)
    return function()
      return gen(skip)
    end
  end)()
  local cs = {}         -- stack of capture ids that are still open

  -- Emits the pending class, if any, as a single-occurrence step.
  local function flushPending()
    if lastFunc then
      table.insert(matcher.functions, simple(lastFunc))
      lastFunc = nil
    end
  end

  -- Hands the pending class to a quantifier; a quantifier with nothing to
  -- apply to is an error.
  local function takePending(bs)
    if not lastFunc then
      error('invalid regex after ' .. sub(regex, 1, bs))
    end
    local pending = lastFunc
    lastFunc = nil
    return pending
  end

  for c, bs, be in it do
    skip = nil
    if plain then
      table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
    elseif ignore then
      if find('123456789', c, 1, true) then
        flushPending()
        table.insert(matcher.functions, capture(tonumber(c)))
      elseif c == 'b' then
        flushPending()
        local b
        b, skip = balancer(sub(regex, be + 1, be + 9))
        table.insert(matcher.functions, b)
      else
        -- Emit whatever was pending first; otherwise an item directly
        -- followed by a %-class (as in "a%d") would be lost.
        flushPending()
        lastFunc = classMatchGenerator('%' .. c)
      end
      ignore = false
    elseif c == '*' then
      table.insert(matcher.functions, star(takePending(bs)))
    elseif c == '+' then
      -- one mandatory occurrence followed by '*'
      local pending = takePending(bs)
      table.insert(matcher.functions, simple(pending))
      table.insert(matcher.functions, star(pending))
    elseif c == '-' then
      table.insert(matcher.functions, minus(takePending(bs)))
    elseif c == '?' then
      table.insert(matcher.functions, question(takePending(bs)))
    elseif c == '^' then
      if bs == 1 then
        matcher.fromStart = true
      else
        error('invalid regex after ' .. sub(regex, 1, bs))
      end
    elseif c == '$' then
      if be == len(regex) then
        matcher.toEnd = true
      else
        error('invalid regex after ' .. sub(regex, 1, bs))
      end
    elseif c == '[' then
      flushPending()
      lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
    elseif c == '(' then
      flushPending()
      table.insert(matcher.captures, {})
      table.insert(cs, #matcher.captures)
      table.insert(matcher.functions, captureStart(cs[#cs]))
      -- "()" is a position capture
      if sub(regex, be + 1, be + 1) == ')' then
        matcher.captures[#matcher.captures].empty = true
      end
    elseif c == ')' then
      flushPending()
      local cap = table.remove(cs)
      if not cap then
        error('invalid capture: "(" missing')
      end
      table.insert(matcher.functions, captureStop(cap))
    elseif c == '.' then
      flushPending()
      lastFunc = function(cC) return cC ~= -1 end
    elseif c == '%' then
      ignore = true
    else
      flushPending()
      -- Literal character. Plain mode keeps a bare ']' literal; in set
      -- mode classMatchGenerator would stop at it and yield a class that
      -- matches nothing. No other character reaching this branch is
      -- treated differently by the two modes.
      lastFunc = classMatchGenerator(c, true)
    end
  end

  if #cs > 0 then
    error('invalid capture: ")" missing')
  end
  flushPending()
  ignore = nil

  -- Final step: accept the match, unless the pattern ends in '$' and the
  -- position is not at the end of the subject.
  table.insert(matcher.functions, function()
    if matcher.toEnd and matcher.str ~= matcher.stringLen then
      matcher:reset()
    else
      matcher.stop = true
    end
  end)

  function matcher:nextFunc()
    self.func = self.func + 1
  end

  function matcher:nextStr()
    self.str = self.str + 1
  end

  -- Saves a fallback that restores only the subject position.
  function matcher:strReset()
    local oldReset = self.reset
    local str = self.str
    self.reset = function(s)
      s.str = str
      s.reset = oldReset
    end
  end

  -- Saves a fallback that resumes at the next step from the current
  -- subject position.
  function matcher:fullResetOnNextFunc()
    local oldReset = self.reset
    local func = self.func + 1
    local str = self.str
    self.reset = function(s)
      s.func = func
      s.str = str
      s.reset = oldReset
    end
  end

  -- Saves a fallback that re-enters the current step at the next subject
  -- character.
  function matcher:fullResetOnNextStr()
    local oldReset = self.reset
    local str = self.str + 1
    local func = self.func
    self.reset = function(s)
      s.func = func
      s.str = str
      s.reset = oldReset
    end
  end

  -- Runs the matcher over `str` starting at character index `start`
  -- (default 1; negative values count from the end).
  -- Returns the first and last character index of the match followed by
  -- the captures, or nothing when there is no match.
  function matcher:process(str, start)
    self.func = 1
    start = start or 1
    self.startStr = (start >= 0) and start or utf8len(str) + start + 1
    self.seqStart = self.startStr
    self.str = self.startStr
    self.stringLen = utf8len(str) + 1
    self.string = str
    self.stop = false
    self.reset = function(s)
      s.func = 1
    end

    while not self.stop do
      if self.str < self.stringLen then
        local char = utf8sub(str, self.str, self.str)
        self.functions[self.func](utf8unicode(char))
      else
        -- past the end of the subject: feed the end-of-input marker
        self.functions[self.func](-1)
      end
    end

    if self.seqStart then
      local captures = {}
      -- ipairs keeps the captures in the order they were opened
      for _, pair in ipairs(self.captures) do
        if pair.empty then
          table.insert(captures, pair[1])
        else
          table.insert(captures, utf8sub(str, pair[1], pair[2]))
        end
      end
      return self.seqStart, self.str - 1, unpack(captures)
    end
  end

  -- Register only a completely built matcher: when compilation raised an
  -- error above, nothing half-initialised is left behind in the cache.
  if not plain then
    cache[regex] = matcher
  else
    cachePlain[regex] = matcher
  end

  return matcher
end
-- string.find
-- Searches `str` for `regex`, starting at character index `init`.
-- plain: when truthy `regex` is matched literally.
-- Returns whatever the matcher's `process` returns: the first and last
-- character index of the match followed by any captures, or nothing when
-- there is no match.
local function utf8find(str, regex, init, plain)
  -- Plain and pattern matchers for the same text differ, so each mode is
  -- looked up in its own cache.
  local store = plain and cachePlain or cache
  local matcher = store[regex]
  -- An entry without `process` was left behind by a compilation that
  -- raised an error; compile again so that the error is reported again.
  if not (matcher and matcher.process) then
    matcher = matcherGenerator(regex, plain)
  end
  return matcher:process(str, init)
end
-- string.match
-- Returns the captures of the first match of `regex` in `str` (searching
-- from character index `init`, default 1), the matched text itself when
-- the pattern has no captures, or nothing when there is no match.
local function utf8match(str, regex, init)
  local result = { utf8find(str, regex, init or 1) }
  local first, last = result[1], result[2]
  if not first then
    return
  end
  if not result[3] then
    -- no captures: hand back the matched text
    return utf8sub(str, first, last)
  end
  return unpack(result, 3)
end
-- string.gmatch
-- Returns an iterator over the successive matches of `regex` in `str`.
-- Each call yields the captures of the next match, or the matched text
-- when the pattern has no captures. When `all` is truthy the iterator
-- yields the first and last character index of the match followed by the
-- captures instead.
local function utf8gmatch(str, regex, all)
  -- an anchor would end the iteration after one match, so a leading '^' is
  -- escaped and matched literally
  regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
  local lastChar = 1
  return function()
    local found = {utf8find(str, regex, lastChar)}
    if found[1] then
      if found[2] < found[1] then
        -- Empty match: step over one character, otherwise the same empty
        -- match would be found again on every call.
        lastChar = found[1] + 1
      else
        lastChar = found[2] + 1
      end
      if found[all and 1 or 3] then
        return unpack(found, all and 1 or 3)
      end
      return utf8sub(str, found[1], found[2])
    end
  end
end
-- Computes the replacement for one match on behalf of utf8gsub.
--
-- repl: replacement specification, as for string.gsub:
--         string   - "%0" stands for the whole match, "%1".."%9" for the
--                    captures, "%%" for a literal percent sign
--         table    - looked up with the first capture (or the whole match)
--         function - called with the captures (or the whole match)
-- args: args[0] is the whole match, args[1..n] are the captures
--
-- As with string.gsub, a table lookup or function call that yields nil or
-- false keeps the original match. Any other type of `repl` yields "".
local function replace(repl, args)
  -- `unpack` is a global in Lua 5.1 and lives in `table` from 5.2 on
  local unpack = table.unpack or unpack
  local kind = type(repl)

  if kind == 'string' then
    local pieces = {}
    local escaped = false
    for c in utf8gensub(repl) do
      if not escaped then
        if c == '%' then
          escaped = true
        else
          pieces[#pieces + 1] = c
        end
      else
        local num = tonumber(c)
        if num then
          local value = args[num]
          if value == nil and num == 1 then
            -- without captures %1 refers to the whole match
            value = args[0]
          end
          if value == nil then
            error('invalid capture index %' .. c .. ' in replacement string')
          end
          pieces[#pieces + 1] = value
        else
          pieces[#pieces + 1] = c
        end
        escaped = false
      end
    end
    return table.concat(pieces)
  end

  local value
  if kind == 'table' then
    value = repl[args[1] or args[0]]
  elseif kind == 'function' then
    if #args > 0 then
      value = repl(unpack(args, 1))
    else
      value = repl(args[0])
    end
  else
    return ''
  end

  if not value then
    -- nil/false means "no replacement": keep the matched text
    return args[0]
  end
  return value
end
-- string.gsub
-- Returns a copy of `str` in which the matches of `regex` are replaced as
-- specified by `repl` (string, table or function, see `replace`), together
-- with the number of replacements made.
-- limit: maximum number of replacements; nil or a negative value means no
--        limit.
local function utf8gsub(str, regex, repl, limit)
  limit = limit or -1
  -- A pattern anchored with '^' can match at most once, at the start. The
  -- search is done with utf8find directly because utf8gmatch deliberately
  -- escapes a leading '^'.
  local anchored = utf8sub(regex, 1, 1) == '^'
  local pieces = {}
  local prevEnd = 1     -- first character not yet copied to the output
  local searchFrom = 1  -- character index where the next search starts
  local n = 0
  while limit ~= n do
    local found = {utf8find(str, regex, searchFrom)}
    if not found[1] then
      break
    end
    local args = {[0] = utf8sub(str, found[1], found[2]), unpack(found, 3)}
    pieces[#pieces + 1] = utf8sub(str, prevEnd, found[1] - 1)
    pieces[#pieces + 1] = replace(repl, args)
    prevEnd = found[2] + 1
    n = n + 1
    if anchored then
      break
    end
    if found[2] < found[1] then
      -- empty match: step over one character so the search makes progress
      searchFrom = found[1] + 1
    else
      searchFrom = found[2] + 1
    end
  end
  pieces[#pieces + 1] = utf8sub(str, prevEnd)
  return table.concat(pieces), n
end
-- Replace the character-oriented functions of the standard `string` table
-- with their UTF-8 aware versions.
string.len     = utf8len
string.sub     = utf8sub
string.reverse = utf8reverse
string.byte    = utf8unicode
string.char    = utf8char
string.find    = utf8find
string.match   = utf8match
string.gmatch  = utf8gmatch
string.gsub    = utf8gsub

-- Extra helper without a standard counterpart.
string.gensub  = utf8gensub

-- These stay bound to the functions captured from `string` at the top of
-- the file.
string.dump    = dump
string.format  = format
string.rep     = rep
string.lower   = lower--utf8lower
string.upper   = upper--utf8upper
--[[ local utf8 = {}
utf8.len = utf8len
utf8.sub = utf8sub
utf8.reverse = utf8reverse
utf8.char = utf8char
utf8.unicode = utf8unicode
utf8.gensub = utf8gensub
utf8.byte = utf8unicode
utf8.find = utf8find
utf8.match = utf8match
utf8.gmatch = utf8gmatch
utf8.gsub = utf8gsub
utf8.dump = dump
utf8.format = format
utf8.lower = lower
utf8.upper = upper
utf8.rep = rep
return utf8 ]]