Merge pull request #153 from Stepets/master

utf8 lib update
This commit is contained in:
Kenny Shields 2014-11-23 19:47:01 -05:00
commit 03f2b0736c

View File

@ -1,17 +1,21 @@
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
--
-- Provides UTF-8 aware string functions implemented in pure lua:
-- * utf8len(s)
-- * utf8sub(s, i, j)
-- * utf8reverse(s)
-- * utf8char(unicode)
-- * utf8unicode(s, i, j)
-- * utf8gensub(s, sub_len)
-- * utf8find(str, regex, init, plain)
-- * utf8match(str, regex, init)
-- * utf8gmatch(str, regex, all)
-- * utf8gsub(str, regex, repl, limit)
--
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
-- additional functions are available:
-- * utf8upper(s)
-- * utf8lower(s)
--
-- All functions behave as their non UTF-8 aware counterparts with the exception
-- that UTF-8 characters are used instead of bytes for all units.
@ -20,6 +24,9 @@
Copyright (c) 2006-2007, Kyle Smith Copyright (c) 2006-2007, Kyle Smith
All rights reserved. All rights reserved.
Contributors:
Alimov Stepan
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met: modification, are permitted provided that the following conditions are met:
@ -57,9 +64,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-- UTF8-tail = %x80-BF -- UTF8-tail = %x80-BF
-- --
local len = string.len local byte = string.byte
local sub = string.sub local char = string.char
local char = string.char local dump = string.dump
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local lower = string.lower
local match = string.match
local rep = string.rep
local reverse = string.reverse
local sub = string.sub
local upper = string.upper
-- returns the number of bytes used by the UTF-8 character at byte i in s -- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator -- also doubles as a UTF-8 character validator
@ -75,7 +93,7 @@ local function utf8charbytes (s, i)
error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")") error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
end end
local c = s:byte(i) local c = byte(s, i)
-- determine bytes needed for character, based on RFC 3629 -- determine bytes needed for character, based on RFC 3629
-- validate byte 1 -- validate byte 1
@ -85,7 +103,7 @@ local function utf8charbytes (s, i)
elseif c >= 194 and c <= 223 then elseif c >= 194 and c <= 223 then
-- UTF8-2 -- UTF8-2
local c2 = s:byte(i + 1) local c2 = byte(s, i + 1)
if not c2 then if not c2 then
error("UTF-8 string terminated early") error("UTF-8 string terminated early")
@ -100,8 +118,8 @@ local function utf8charbytes (s, i)
elseif c >= 224 and c <= 239 then elseif c >= 224 and c <= 239 then
-- UTF8-3 -- UTF8-3
local c2 = s:byte(i + 1) local c2 = byte(s, i + 1)
local c3 = s:byte(i + 2) local c3 = byte(s, i + 2)
if not c2 or not c3 then if not c2 or not c3 then
error("UTF-8 string terminated early") error("UTF-8 string terminated early")
@ -125,9 +143,9 @@ local function utf8charbytes (s, i)
elseif c >= 240 and c <= 244 then elseif c >= 240 and c <= 244 then
-- UTF8-4 -- UTF8-4
local c2 = s:byte(i + 1) local c2 = byte(s, i + 1)
local c3 = s:byte(i + 2) local c3 = byte(s, i + 2)
local c4 = s:byte(i + 3) local c4 = byte(s, i + 3)
if not c2 or not c3 or not c4 then if not c2 or not c3 or not c4 then
error("UTF-8 string terminated early") error("UTF-8 string terminated early")
@ -275,10 +293,10 @@ local function utf8reverse (s)
local newstr = "" local newstr = ""
while pos > 0 do while pos > 0 do
c = s:byte(pos) c = byte(s, pos)
while c >= 128 and c <= 191 do while c >= 128 and c <= 191 do
pos = pos - 1 pos = pos - 1
c = s:byte(pos) c = byte(s, pos)
end end
charbytes = utf8charbytes(s, pos) charbytes = utf8charbytes(s, pos)
@ -342,25 +360,25 @@ utf8unicode = function(str, i, j, byte_pos)
bytes = utf8charbytes(str,byte_pos) bytes = utf8charbytes(str,byte_pos)
char = sub(str,byte_pos,byte_pos-1+bytes) char = sub(str,byte_pos,byte_pos-1+bytes)
else else
char,byte_pos = utf8sub(str,i,i) char,byte_pos = utf8sub(str,i,i), 0
bytes = #char bytes = #char
end end
local unicode local unicode
if bytes == 1 then unicode = string.byte(char) end if bytes == 1 then unicode = byte(char) end
if bytes == 2 then if bytes == 2 then
local byte0,byte1 = string.byte(char,1,2) local byte0,byte1 = byte(char,1,2)
local code0,code1 = byte0-0xC0,byte1-0x80 local code0,code1 = byte0-0xC0,byte1-0x80
unicode = code0*shift_6 + code1 unicode = code0*shift_6 + code1
end end
if bytes == 3 then if bytes == 3 then
local byte0,byte1,byte2 = string.byte(char,1,3) local byte0,byte1,byte2 = byte(char,1,3)
local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80 local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
unicode = code0*shift_12 + code1*shift_6 + code2 unicode = code0*shift_12 + code1*shift_6 + code2
end end
if bytes == 4 then if bytes == 4 then
local byte0,byte1,byte2,byte3 = string.byte(char,1,4) local byte0,byte1,byte2,byte3 = byte(char,1,4)
local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80 local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3 unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
end end
@ -373,7 +391,8 @@ local function utf8gensub(str, sub_len)
sub_len = sub_len or 1 sub_len = sub_len or 1
local byte_pos = 1 local byte_pos = 1
local len = #str local len = #str
return function() return function(skip)
if skip then byte_pos = byte_pos + skip end
local char_count = 0 local char_count = 0
local start = byte_pos local start = byte_pos
repeat repeat
@ -390,9 +409,657 @@ local function utf8gensub(str, sub_len)
end end
end end
-- Binary search over a sorted array.
--
-- sortedTable: array sorted in ascending order (or in the order defined by comp)
-- item:        value to look for
-- comp:        optional strict "less than" comparator with the same contract as
--              the one accepted by table.sort; when omitted the `>` operator is used
--
-- Returns `true, index` when item is found, `false` otherwise.
-- The function has no side effects.
local function binsearch(sortedTable, item, comp)
  local head, tail = 1, #sortedTable

  -- narrow [head, tail] until at most two candidates remain
  while (tail - head) > 1 do
    local mid = math.floor((head + tail) / 2)
    local itemSortsBefore
    if comp then
      itemSortsBefore = comp(item, sortedTable[mid])
    else
      itemSortsBefore = sortedTable[mid] > item
    end
    if itemSortsBefore then
      tail = mid
    else
      head = mid
    end
  end

  -- the loop never compares for equality, so check both survivors
  if sortedTable[head] == item then
    return true, head
  elseif sortedTable[tail] == item then
    return true, tail
  end
  return false
end
-- Code points contributed by each supported %-class letter.
-- `ranges` holds inclusive {low, high} pairs, `codes` holds single code points.
local classDefinitions = {
  -- %a: all letters (ONLY ASCII)
  a = { ranges = { {65, 90}, {97, 122} } },
  -- %c: all control characters
  c = { ranges = { {0, 31} }, codes = { 127 } },
  -- %d: all digits
  d = { ranges = { {48, 57} } },
  -- %g: all printable characters except space
  g = { ranges = {
    {1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191},
    {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287},
  } },
  -- %l: all lowercase letters (ONLY ASCII)
  l = { ranges = { {97, 122} } },
  -- %p: all punctuation characters (ONLY ASCII)
  p = { ranges = { {33, 47}, {58, 64}, {91, 96}, {123, 126} } },
  -- %s: all space characters
  s = {
    ranges = { {9, 13}, {8192, 8202} },
    codes = { 32, 133, 160, 5760, 8232, 8233, 8239, 8287, 12288 },
  },
  -- %u: all uppercase letters (ONLY ASCII)
  u = { ranges = { {65, 90} } },
  -- %w: all alphanumeric characters (ONLY ASCII)
  w = { ranges = { {48, 57}, {65, 90}, {97, 122} } },
  -- %x: all hexadecimal digits
  x = { ranges = { {48, 57}, {65, 70}, {97, 102} } },
}

-- Builds a predicate for a character class.
--
-- class: the class text. Either a single (possibly %-escaped) character, or the
--        part of a pattern that follows '[' -- parsing stops at the first
--        unescaped ']'.
-- plain: when truthy every character of `class` is taken literally.
--
-- Returns two values:
--   1. function(charCode) -> truthy when the code point belongs to the class.
--      A negated class ('^' as first character) never matches -1, the value
--      the matcher passes at end of input.
--   2. the third value produced by utf8gensub for the last character consumed
--      (the ']' when present); the pattern compiler uses it to skip the class.
--
-- A '^' that is not the first character and a '-' that has no lower bound are
-- treated as literal characters, as in standard Lua sets.
local function classMatchGenerator(class, plain)
  local HYPHEN = 45
  local codes = {}
  local ranges = {}
  local ignore = false      -- previous character was '%'
  local range = false       -- a '-' was seen and is waiting for its upper bound
  local firstletter = true
  local unmatch = false     -- class is negated

  -- Adds a literal code point, completing a pending "low-high" range if any.
  local function addLiteral(code)
    if not range then
      table.insert(codes, code)
      return
    end
    range = false
    table.remove(codes) -- removing '-'
    local low = table.remove(codes)
    if low then
      table.insert(ranges, {low, code})
    else
      -- nothing precedes the '-': keep it and this character as literals
      table.insert(codes, HYPHEN)
      table.insert(codes, code)
    end
  end

  -- Adds everything a %-class stands for.
  local function addClass(definition)
    for _, r in ipairs(definition.ranges or {}) do
      table.insert(ranges, r)
    end
    for _, code in ipairs(definition.codes or {}) do
      table.insert(codes, code)
    end
  end

  local skip
  for c, _, be in utf8gensub(class) do
    skip = be
    if plain then
      addLiteral(utf8unicode(c))
      ignore = false
    elseif ignore then
      local definition = classDefinitions[c]
      if definition then
        addClass(definition)
      else
        addLiteral(utf8unicode(c)) -- escaped literal such as %] or %-
      end
      ignore = false
    elseif c == "%" then
      ignore = true
    elseif c == "-" then
      -- stored as a literal for now; removed again if an upper bound follows
      table.insert(codes, utf8unicode(c))
      range = true
    elseif c == "^" and firstletter then
      unmatch = true
    elseif c == ']' then
      break
    else
      addLiteral(utf8unicode(c))
    end
    firstletter = false
  end

  table.sort(codes) -- binsearch needs sorted input

  local function inRanges(charCode)
    for _, r in ipairs(ranges) do
      if r[1] <= charCode and charCode <= r[2] then
        return true
      end
    end
    return false
  end

  if not unmatch then
    return function(charCode)
      return binsearch(codes, charCode) or inRanges(charCode)
    end, skip
  else
    return function(charCode)
      return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
    end, skip
  end
end
-- Variant of utf8sub that can start scanning at a given byte and also reports
-- where it stopped.
--
-- s:  source string
-- i:  index of the first character (negative counts from the end)
-- j:  index of the last character (negative counts from the end, default -1)
-- sb: byte offset at which scanning starts (default 1); character counting
--     begins at that byte
--
-- Returns the substring and the byte offset following it. When the start
-- lies after the end only "" is returned.
local function utf8subWithBytes (s, i, j, sb)
  j = j or -1

  local cursor = sb or 1
  local byteCount = len(s)
  local seen = 0

  -- the character count is only computed when a negative index needs it
  local total = (i >= 0 and j >= 0) or utf8len(s)

  local firstChar = i
  if i < 0 then
    firstChar = total + i + 1
  end
  local lastChar = j
  if j < 0 then
    lastChar = total + j + 1
  end

  if firstChar > lastChar then
    return ""
  end

  -- byte offsets handed to string.sub
  local firstByte, lastByte = 1, byteCount

  while cursor <= byteCount do
    seen = seen + 1
    if seen == firstChar then
      firstByte = cursor
    end

    cursor = cursor + utf8charbytes(s, cursor)

    if seen == lastChar then
      lastByte = cursor - 1
      break
    end
  end

  if firstChar > seen then
    firstByte = byteCount + 1
  end
  if lastChar < 1 then
    lastByte = 0
  end

  return sub(s, firstByte, lastByte), lastByte + 1
end
-- Compiled matchers, looked up by pattern text: `cache` for regular patterns,
-- `cachePlain` for plain-text searches. Both tables are weak in keys and
-- values (__mode = 'kv').
local cache, cachePlain
do
  local function newWeakTable()
    return setmetatable({}, { __mode = 'kv' })
  end
  cache, cachePlain = newWeakTable(), newWeakTable()
end
-- Compiles `regex` into a matcher object and registers it in `cache`
-- (or `cachePlain` when `plain` is truthy).
--
-- The matcher is a small backtracking state machine:
--   * matcher.functions is a list of step functions; each receives the code
--     point at the current position (or -1 at end of input) and moves the
--     machine with nextFunc / nextStr / reset.
--   * matcher.func is the index of the current step, matcher.str the current
--     character index in the subject.
--   * matcher.reset is the current backtrack action; the fullResetOn* helpers
--     push a new one that restores the previous one when it fires.
--   * matcher.captures[id] = {startChar, endChar, empty = true|nil}.
--
-- Returns the matcher; matcher:process(str, start) runs it.
-- Raises an error for malformed patterns.
-- NOTE(review): the matcher is stored in the cache before the pattern is
-- parsed, so a pattern that raises below leaves a half-built entry behind.
local function matcherGenerator(regex, plain)
local matcher = {
functions = {},
captures = {}
}
if not plain then
cache[regex] = matcher
else
cachePlain[regex] = matcher
end
-- Exactly one occurrence: consume the character and advance, or backtrack.
local function simple(func)
return function(cC)
if func(cC) then
matcher:nextFunc()
matcher:nextStr()
else
matcher:reset()
end
end
end
-- '*' (greedy): while the character matches, record a backtrack point that
-- resumes at the next step from here, then consume and stay on this step.
local function star(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextFunc()
matcher:nextStr()
else
matcher:nextFunc()
end
end
end
-- '-' (lazy): move on without consuming; if the character matches, record a
-- backtrack point that retries this step one character further.
local function minus(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextStr()
end
matcher:nextFunc()
end
end
-- '?': consume the character when it matches (with a backtrack point that
-- skips it), then always go to the next step.
local function question(func)
return function(cC)
if func(cC) then
matcher:fullResetOnNextFunc()
matcher:nextStr()
end
matcher:nextFunc()
end
end
-- Back-reference %1..%9: the text at the current position must equal the
-- text already captured by group `id`.
local function capture(id)
return function(cC)
-- l is the captured length minus one (both bounds are inclusive)
local l = matcher.captures[id][2] - matcher.captures[id][1]
local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
if captured == check then
-- skip the l + 1 characters that were just compared
for i = 0, l do
matcher:nextStr()
end
matcher:nextFunc()
else
matcher:reset()
end
end
end
-- '(': remember where group `id` starts; consumes nothing.
local function captureStart(id)
return function(cC)
matcher.captures[id][1] = matcher.str
matcher:nextFunc()
end
end
-- ')': remember the last character of group `id`; consumes nothing.
local function captureStop(id)
return function(cC)
matcher.captures[id][2] = matcher.str - 1
matcher:nextFunc()
end
end
-- %b: `str` starts with the opening and closing delimiter characters.
-- Returns the step function and the byte length of the two delimiters so the
-- pattern parser can skip them.
local function balancer(str)
local sum = 0 -- current nesting depth
local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
local skip = len(bc) + len(ec)
bc, ec = utf8unicode(bc), utf8unicode(ec)
return function(cC)
if cC == ec and sum > 0 then
sum = sum - 1
if sum == 0 then
-- outermost delimiter closed: the balanced run is complete
matcher:nextFunc()
end
matcher:nextStr()
elseif cC == bc then
sum = sum + 1
matcher:nextStr()
else
if sum == 0 or cC == -1 then
-- not inside a balanced run, or input ended before it was closed
sum = 0
matcher:reset()
else
matcher:nextStr()
end
end
end, skip
end
-- Step 1 starts a match attempt at the current position. The backtrack
-- action it installs restarts this step one character further.
matcher.functions[1] = function(cC)
matcher:fullResetOnNextStr()
matcher.seqStart = matcher.str
matcher:nextFunc()
-- give up when an anchored pattern has moved past its start position, or
-- when the position has reached stringLen (one past the last character)
if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
matcher.stop = true
matcher.seqStart = nil
end
end
-- lastFunc holds the most recent single-character predicate; it is only
-- turned into a step once it is known whether a quantifier follows it.
local lastFunc
local ignore = false -- previous pattern character was '%'
-- number of extra bytes the iterator has to jump over before the next
-- character (set after a [set] or a %b item)
local skip = nil
local it = (function()
local gen = utf8gensub(regex)
return function()
return gen(skip)
end
end)()
-- stack of capture ids that are currently open
local cs = {}
for c, bs, be in it do
skip = nil
if plain then
-- plain search: every character is a literal, no quantifiers
-- (classMatchGenerator also returns a skip value; simple() ignores it)
table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
else
if ignore then
-- character following '%'
if find('123456789', c, 1, true) then
-- %1..%9: back-reference
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
table.insert(matcher.functions, capture(tonumber(c)))
elseif c == 'b' then
-- %bxy: balanced match; the delimiters are read from the bytes after 'b'
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
local b
b, skip = balancer(sub(regex, be + 1, be + 9))
table.insert(matcher.functions, b)
else
-- character class (%a, %d, ...) or escaped literal
lastFunc = classMatchGenerator('%' .. c)
end
ignore = false
else
if c == '*' then
if lastFunc then
table.insert(matcher.functions, star(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '+' then
-- one-or-more is compiled as one occurrence followed by '*'
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
table.insert(matcher.functions, star(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '-' then
if lastFunc then
table.insert(matcher.functions, minus(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '?' then
if lastFunc then
table.insert(matcher.functions, question(lastFunc))
lastFunc = nil
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '^' then
-- anchor: only accepted as the very first character
if bs == 1 then
matcher.fromStart = true
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '$' then
-- anchor: only accepted as the very last character
if be == len(regex) then
matcher.toEnd = true
else
error('invalid regex after ' .. sub(regex, 1, bs))
end
elseif c == '[' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
-- the set is parsed from the rest of the pattern; skip makes the
-- iterator jump over it
lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
elseif c == '(' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
table.insert(matcher.captures, {})
table.insert(cs, #matcher.captures)
table.insert(matcher.functions, captureStart(cs[#cs]))
-- "()" is a position capture: process() returns its start index
if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
elseif c == ')' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
lastFunc = nil
end
local cap = table.remove(cs)
if not cap then
error('invalid capture: "(" missing')
end
table.insert(matcher.functions, captureStop(cap))
elseif c == '.' then
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
-- any character, but not end of input
lastFunc = function(cC) return cC ~= -1 end
elseif c == '%' then
ignore = true
else
-- ordinary literal character
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc = classMatchGenerator(c)
end
end
end
end
if #cs > 0 then
error('invalid capture: ")" missing')
end
-- flush a trailing predicate that had no quantifier
if lastFunc then
table.insert(matcher.functions, simple(lastFunc))
end
lastFunc = nil
ignore = nil
-- Final step: the whole pattern matched. With a '$' anchor the match is only
-- accepted when the position is at end of input, otherwise backtrack.
table.insert(matcher.functions, function()
if matcher.toEnd and matcher.str ~= matcher.stringLen then
matcher:reset()
else
matcher.stop = true
end
end)
-- advance to the next step
matcher.nextFunc = function(self)
self.func = self.func + 1
end
-- advance to the next character of the subject
matcher.nextStr = function(self)
self.str = self.str + 1
end
-- Pushes a backtrack action that restores only the current string position.
matcher.strReset = function(self)
local oldReset = self.reset
local str = self.str
self.reset = function(s)
s.str = str
s.reset = oldReset
end
end
-- Pushes a backtrack action that resumes at the NEXT step from the current
-- string position.
matcher.fullResetOnNextFunc = function(self)
local oldReset = self.reset
local func = self.func +1
local str = self.str
self.reset = function(s)
s.func = func
s.str = str
s.reset = oldReset
end
end
-- Pushes a backtrack action that resumes at the CURRENT step from the NEXT
-- string position.
matcher.fullResetOnNextStr = function(self)
local oldReset = self.reset
local str = self.str + 1
local func = self.func
self.reset = function(s)
s.func = func
s.str = str
s.reset = oldReset
end
end
-- Runs the matcher on `str` starting at character index `start`
-- (default 1; negative values count from the end).
-- Returns the first and last character index of the match followed by the
-- captures, or nothing when there is no match.
matcher.process = function(self, str, start)
self.func = 1
start = start or 1
self.startStr = (start >= 0) and start or utf8len(str) + start + 1
self.seqStart = self.startStr
self.str = self.startStr
-- one past the last character
self.stringLen = utf8len(str) + 1
self.string = str
self.stop = false
-- bottom of the backtrack stack: go back to step 1
self.reset = function(s)
s.func = 1
end
local lastPos = self.str
local lastByte
local char
while not self.stop do
if self.str < self.stringLen then
--[[ if lastPos < self.str then
print('last byte', lastByte)
char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
lastByte = lastByte - 1
else
char, lastByte = utf8subWithBytes(str, self.str, self.str)
end
lastPos = self.str ]]
char = utf8sub(str, self.str,self.str)
--print('char', char, utf8unicode(char))
self.functions[self.func](utf8unicode(char))
else
-- past the end of the subject: steps receive -1
self.functions[self.func](-1)
end
end
-- seqStart is cleared by step 1 when no match is possible
if self.seqStart then
local captures = {}
for _,pair in pairs(self.captures) do
if pair.empty then
-- position capture: the index itself
table.insert(captures, pair[1])
else
table.insert(captures, utf8sub(str, pair[1], pair[2]))
end
end
return self.seqStart, self.str - 1, unpack(captures)
end
end
return matcher
end
-- string.find
--
-- str:   subject string
-- regex: pattern (or literal text when `plain` is truthy)
-- init:  character index at which the search starts (may be nil)
-- plain: when truthy the pattern is matched literally
--
-- Returns whatever the compiled matcher's process() returns: the first and
-- last character index of the match followed by the captures, or nothing.
local function utf8find(str, regex, init, plain)
  -- plain and pattern matchers for the same text are different objects and
  -- live in separate caches, so the lookup has to honour `plain`
  local store = plain and cachePlain or cache
  local matcher = store[regex] or matcherGenerator(regex, plain)
  return matcher:process(str, init)
end
-- string.match
--
-- Searches `str` for `regex` starting at character index `init` (default 1).
-- Returns the captures when the pattern produced any, otherwise the whole
-- matched text; returns nothing when there is no match.
local function utf8match(str, regex, init)
  local result = { utf8find(str, regex, init or 1) }
  local first, last = result[1], result[2]

  if not first then
    return
  end

  -- entries 3.. are the captures reported by utf8find
  if result[3] then
    return unpack(result, 3)
  end

  return utf8sub(str, first, last)
end
-- string.gmatch
--
-- str:   subject string
-- regex: pattern; a leading '^' is escaped so that it is matched literally
-- all:   when truthy each step also yields the match positions, i.e. it
--        returns first, last, captures... (used by utf8gsub)
--
-- Returns an iterator. Without `all` each step yields the captures, or the
-- whole match when the pattern has none. The iterator returns nothing once
-- no further match exists.
local function utf8gmatch(str, regex, all)
  if utf8sub(regex, 1, 1) == '^' then
    regex = '%' .. regex
  end

  local lastChar = 1
  return function()
    local found = {utf8find(str, regex, lastChar)}
    if not found[1] then
      return
    end

    if found[2] < found[1] then
      -- empty match: step over one character, otherwise the next search
      -- would start at the same place and never terminate
      lastChar = found[1] + 1
    else
      lastChar = found[2] + 1
    end

    local firstResult = all and 1 or 3
    if found[firstResult] then
      return unpack(found, firstResult)
    end
    return utf8sub(str, found[1], found[2])
  end
end
-- Computes the replacement text for one match (helper of utf8gsub).
--
-- repl: replacement, interpreted like the third argument of string.gsub:
--         string   - "%0" is the whole match, "%1".."%9" a capture, "%x" for
--                    any other x is x itself (so "%%" yields "%")
--         table    - looked up with the first capture (or the whole match)
--         function - called with the captures (or the whole match)
--       any other type yields ''.
-- args: args[0] is the whole match, args[1..n] are the captures.
--
-- As with string.gsub, a false/nil result from the table or function keeps
-- the original match, and "%1" means the whole match when the pattern has no
-- captures. Raises an error for a capture index that does not exist.
local function replace(repl, args)
  local kind = type(repl)

  if kind == 'string' then
    local pieces = {}
    local escaped = false -- previous character was '%'
    for c in utf8gensub(repl) do
      if escaped then
        escaped = false
        local num = tonumber(c)
        if not num then
          pieces[#pieces + 1] = c
        else
          local value = args[num]
          if value == nil and num == 1 then
            value = args[0]
          end
          if value == nil then
            error('invalid capture index %' .. c .. ' in replacement string')
          end
          pieces[#pieces + 1] = value
        end
      elseif c == '%' then
        escaped = true
      else
        pieces[#pieces + 1] = c
      end
    end
    return table.concat(pieces)
  end

  local value
  if kind == 'table' then
    value = repl[args[1] or args[0]]
  elseif kind == 'function' then
    if #args > 0 then
      value = repl(unpack(args, 1))
    else
      value = repl(args[0])
    end
  else
    return ''
  end

  -- false/nil means "no replacement": the matched text stays as it is
  return value or args[0]
end
-- string.gsub
--
-- Replaces matches of `regex` in `str` with what replace() produces for
-- `repl`. `limit` is the maximum number of replacements; when omitted (-1)
-- every match is replaced.
-- Returns the resulting string and the number of replacements made.
local function utf8gsub(str, regex, repl, limit)
  local maxCount = limit or -1
  local nextMatch = utf8gmatch(str, regex, true)

  local result = ''
  local cursor = 1 -- first character not yet copied to the result
  local count = 0

  local hit = { nextMatch() }
  while #hit > 0 and count ~= maxCount do
    local first, last = hit[1], hit[2]
    -- [0] is the whole match, [1..] are the captures
    local captures = { [0] = utf8sub(str, first, last), unpack(hit, 3) }

    result = result .. utf8sub(str, cursor, first - 1) .. replace(repl, captures)

    cursor = last + 1
    count = count + 1
    hit = { nextMatch() }
  end

  return result .. utf8sub(str, cursor), count
end
-- Install the library into the global `string` table.

-- UTF-8 aware replacements
string.len     = utf8len
string.sub     = utf8sub
string.reverse = utf8reverse
string.byte    = utf8unicode
string.char    = utf8char
string.find    = utf8find
string.match   = utf8match
string.gmatch  = utf8gmatch
string.gsub    = utf8gsub

-- additional function with no standard counterpart
string.gensub  = utf8gensub

-- standard functions, re-assigned from the locals captured at the top of the file
string.dump    = dump
string.format  = format
string.rep     = rep
-- case mapping is not switched to utf8lower / utf8upper here
string.lower   = lower
string.upper   = upper
--[[ local utf8 = {}
utf8.len = utf8len
utf8.sub = utf8sub
utf8.reverse = utf8reverse
utf8.char = utf8char
utf8.unicode = utf8unicode
utf8.gensub = utf8gensub
utf8.byte = utf8unicode
utf8.find = utf8find
utf8.match = utf8match
utf8.gmatch = utf8gmatch
utf8.gsub = utf8gsub
utf8.dump = dump
utf8.format = format
utf8.lower = lower
utf8.upper = upper
utf8.rep = rep
return utf8 ]]