Merge pull request #153 from Stepets/master

utf8 lib update
2024-11-18 16:04:22 +00:00 · 2014-11-23 19:47:01 -05:00 · 2014-11-23 19:47:01 -05:00 · 03f2b0736c
commit 03f2b0736c
parent 89135abc06 ff861631c4
1 changed files with 699 additions and 32 deletions
--- a/libraries/utf8.lua
+++ b/libraries/utf8.lua
@ -1,17 +1,21 @@
 -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
 --
 -- Provides UTF-8 aware string functions implemented in pure lua:
-- * string.utf8len(s)
+-- * utf8len(s)
-- * string.utf8sub(s, i, j)
+-- * utf8sub(s, i, j)
-- * string.utf8reverse(s)
+-- * utf8reverse(s)
-- * string.utf8char(unicode)
+-- * utf8char(unicode)
-- * string.utf8unicode(s, i, j)
+-- * utf8unicode(s, i, j)
-- * string.utf8gensub(s, sub_len)
+-- * utf8gensub(s, sub_len)
 -- * utf8find(str, regex, init, plain)
 -- * utf8match(str, regex, init)
 -- * utf8gmatch(str, regex, all)
 -- * utf8gsub(str, regex, repl, limit)
 --
 -- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
 -- additional functions are available:
-- * string.utf8upper(s)
+-- * utf8upper(s)
-- * string.utf8lower(s)
+-- * utf8lower(s)
 --
 -- All functions behave as their non UTF-8 aware counterparts with the exception
 -- that UTF-8 characters are used instead of bytes for all units.
@ -20,6 +24,9 @@
 Copyright (c) 2006-2007, Kyle Smith
 All rights reserved.
 Contributors:
 	Alimov Stepan
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@ -57,9 +64,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 -- UTF8-tail   = %x80-BF
 -- 
-local len = string.len
+local byte    = string.byte
-local sub = string.sub
+local char    = string.char
-local char = string.char
+local dump    = string.dump
 local find    = string.find
 local format  = string.format
 local gmatch  = string.gmatch
 local gsub    = string.gsub
 local len     = string.len
 local lower   = string.lower
 local match   = string.match
 local rep     = string.rep
 local reverse = string.reverse
 local sub     = string.sub
 local upper   = string.upper
 -- returns the number of bytes used by the UTF-8 character at byte i in s
 -- also doubles as a UTF-8 character validator
@ -75,7 +93,7 @@ local function utf8charbytes (s, i)
 		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
 	end
-	local c = s:byte(i)
+	local c = byte(s, i)
 	-- determine bytes needed for character, based on RFC 3629
 	-- validate byte 1
@ -85,7 +103,7 @@ local function utf8charbytes (s, i)
 	elseif c >= 194 and c <= 223 then
 		-- UTF8-2
-		local c2 = s:byte(i + 1)
+		local c2 = byte(s, i + 1)
 		if not c2 then
 			error("UTF-8 string terminated early")
@ -100,8 +118,8 @@ local function utf8charbytes (s, i)
 	elseif c >= 224 and c <= 239 then
 		-- UTF8-3
-		local c2 = s:byte(i + 1)
+		local c2 = byte(s, i + 1)
-		local c3 = s:byte(i + 2)
+		local c3 = byte(s, i + 2)
 		if not c2 or not c3 then
 			error("UTF-8 string terminated early")
@ -125,9 +143,9 @@ local function utf8charbytes (s, i)
 	elseif c >= 240 and c <= 244 then
 		-- UTF8-4
-		local c2 = s:byte(i + 1)
+		local c2 = byte(s, i + 1)
-		local c3 = s:byte(i + 2)
+		local c3 = byte(s, i + 2)
-		local c4 = s:byte(i + 3)
+		local c4 = byte(s, i + 3)
 		if not c2 or not c3 or not c4 then
 			error("UTF-8 string terminated early")
@ -275,10 +293,10 @@ local function utf8reverse (s)
 	local newstr = ""
 	while pos > 0 do
-		c = s:byte(pos)
+		c = byte(s, pos)
 		while c >= 128 and c <= 191 do
 			pos = pos - 1
-			c = s:byte(pos)
+			c = byte(s, pos)
 		end
 		charbytes = utf8charbytes(s, pos)
@ -342,25 +360,25 @@ utf8unicode = function(str, i, j, byte_pos)
 		bytes = utf8charbytes(str,byte_pos)
 		char  = sub(str,byte_pos,byte_pos-1+bytes)
 	else
-		char,byte_pos = utf8sub(str,i,i)
+		char,byte_pos = utf8sub(str,i,i), 0
 		bytes         = #char
 	end
 	local unicode
-	if bytes == 1 then unicode = string.byte(char) end
+	if bytes == 1 then unicode = byte(char) end
 	if bytes == 2 then
-		local byte0,byte1 = string.byte(char,1,2)
+		local byte0,byte1 = byte(char,1,2)
 		local code0,code1 = byte0-0xC0,byte1-0x80
 		unicode = code0*shift_6 + code1
 	end
 	if bytes == 3 then
-		local byte0,byte1,byte2 = string.byte(char,1,3)
+		local byte0,byte1,byte2 = byte(char,1,3)
 		local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
 		unicode = code0*shift_12 + code1*shift_6 + code2
 	end
 	if bytes == 4 then
-		local byte0,byte1,byte2,byte3 = string.byte(char,1,4)
+		local byte0,byte1,byte2,byte3 = byte(char,1,4)
 		local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
 		unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
 	end
@ -373,7 +391,8 @@ local function utf8gensub(str, sub_len)
 	sub_len        = sub_len or 1
 	local byte_pos = 1
 	local len      = #str
-	return function()
+	return function(skip)
 		if skip then byte_pos = byte_pos + skip end
 		local char_count = 0
 		local start      = byte_pos
 		repeat
@ -390,9 +409,657 @@ local function utf8gensub(str, sub_len)
 	end
 end
-string.len       = utf8len
+local function binsearch(sortedTable, item, comp)
-string.sub       = utf8sub
+	local head, tail = 1, #sortedTable
-string.reverse   = utf8reverse
+	local mid = math.floor((head + tail)/2)
-string.char      = utf8char
+	if not comp then
-string.unicode   = utf8unicode
+		while (tail - head) > 1 do
-string.gensub    = utf8gensub
+			if sortedTable[tonumber(mid)] > item then
 				tail = mid
 			else
 				head = mid
 			end
 			mid = math.floor((head + tail)/2)
 		end
 	else
 	end
 	if sortedTable[tonumber(head)] == item then
 		return true, tonumber(head)
 	elseif sortedTable[tonumber(tail)] == item then
 		return true, tonumber(tail)
 	else
 		return false
 	end
 end
 -- Compiles one character class into a predicate over Unicode code points.
 --
 --   class  text of the class: a single character, a '%'-escaped character,
 --          or the inside of a set (everything after the opening '['; parsing
 --          stops at the first unescaped ']')
 --   plain  when truthy no character is treated as magic
 --
 -- Returns two values:
 --   1. function(charCode) -> truthy when charCode belongs to the class.
 --      Negated sets ('^' in first position) never accept -1, the code the
 --      matcher passes for "end of string".
 --   2. the third value utf8gensub yielded for the last character consumed;
 --      callers feed it back to the pattern iterator to skip the set's text
 --      (nil when `class` is empty).
 --
 -- '^' is only special in first position and '-' only after a literal that
 -- can serve as the lower bound of a range; elsewhere both are ordinary
 -- characters, as in the standard Lua pattern syntax.
 local function classMatchGenerator(class, plain)
 	local codes = {}
 	local ranges = {}
 	local ignore = false
 	local range = false
 	local firstletter = true
 	local unmatch = false
 	-- Adds character c as a single code, or - when a '-' was just seen - uses
 	-- it as the upper bound of a range whose lower bound is the code that
 	-- preceded the '-'.
 	local function addLiteral(c)
 		local code = utf8unicode(c)
 		if range then
 			table.remove(codes) -- removing '-'
 			table.insert(ranges, {table.remove(codes), code})
 			range = false
 		else
 			table.insert(codes, code)
 		end
 	end
 	local it = utf8gensub(class)
 	local skip
 	for c,bs,be in it do
 		skip = be
 		if not ignore and not plain then
 			if c == "%" then
 				ignore = true
 			elseif c == "-" then
 				-- Without a preceding code there is no lower bound, so the
 				-- '-' stays in `codes` as a literal and opens no range.
 				range = #codes > 0
 				table.insert(codes, utf8unicode(c))
 			elseif c == "^" and firstletter then
 				unmatch = true
 			elseif c == ']' then
 				break
 			else
 				-- includes '^' in any position but the first
 				addLiteral(c)
 			end
 		elseif ignore and not plain then
 			if c == 'a' then -- %a: represents all letters. (ONLY ASCII)
 				table.insert(ranges, {65, 90}) -- A - Z
 				table.insert(ranges, {97, 122}) -- a - z
 			elseif c == 'c' then -- %c: represents all control characters.
 				table.insert(ranges, {0, 31})
 				table.insert(codes, 127)
 			elseif c == 'd' then -- %d: represents all digits.
 				table.insert(ranges, {48, 57}) -- 0 - 9
 			elseif c == 'g' then -- %g: represents all printable characters except space.
 				table.insert(ranges, {1, 8})
 				table.insert(ranges, {14, 31})
 				table.insert(ranges, {33, 132})
 				table.insert(ranges, {134, 159})
 				table.insert(ranges, {161, 5759})
 				table.insert(ranges, {5761, 8191})
 				table.insert(ranges, {8203, 8231})
 				table.insert(ranges, {8234, 8238})
 				table.insert(ranges, {8240, 8286})
 				table.insert(ranges, {8288, 12287})
 			elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII)
 				table.insert(ranges, {97, 122}) -- a - z
 			elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII)
 				table.insert(ranges, {33, 47})
 				table.insert(ranges, {58, 64})
 				table.insert(ranges, {91, 96})
 				table.insert(ranges, {123, 126})
 			elseif c == 's' then -- %s: represents all space characters.
 				table.insert(ranges, {9, 13})
 				table.insert(codes, 32)
 				table.insert(codes, 133)
 				table.insert(codes, 160)
 				table.insert(codes, 5760)
 				table.insert(ranges, {8192, 8202})
 				table.insert(codes, 8232)
 				table.insert(codes, 8233)
 				table.insert(codes, 8239)
 				table.insert(codes, 8287)
 				table.insert(codes, 12288)
 			elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII)
 				table.insert(ranges, {65, 90}) -- A - Z
 			elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII)
 				table.insert(ranges, {48, 57}) -- 0 - 9
 				table.insert(ranges, {65, 90}) -- A - Z
 				table.insert(ranges, {97, 122}) -- a - z
 			elseif c == 'x' then -- %x: represents all hexadecimal digits.
 				table.insert(ranges, {48, 57}) -- 0 - 9
 				table.insert(ranges, {65, 70}) -- A - F
 				table.insert(ranges, {97, 102}) -- a - f
 			else
 				-- any other escaped character stands for itself
 				addLiteral(c)
 			end
 			ignore = false
 		else
 			-- plain mode: every character is a literal
 			addLiteral(c)
 			ignore = false
 		end
 		firstletter = false
 	end
 	-- binsearch needs the single codes in ascending order
 	table.sort(codes)
 	local function inRanges(charCode)
 		for _,r in ipairs(ranges) do
 			if r[1] <= charCode and charCode <= r[2] then
 				return true
 			end
 		end
 		return false
 	end
 	if not unmatch then
 		return function(charCode)
 			return binsearch(codes, charCode) or inRanges(charCode)
 		end, skip
 	else
 		return function(charCode)
 			return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
 		end, skip
 	end
 end
 -- utf8sub variant that can start scanning at a given byte offset and that
 -- also reports where the extracted substring ends.
 --   s     string to slice
 --   i, j  first / last character index; negative values are resolved against
 --         utf8len(s); j defaults to -1
 --   sb    byte offset at which scanning (and character counting) starts,
 --         defaults to 1
 -- Returns the substring and the byte offset following its last byte; returns
 -- only "" when the start index lies after the end index.
 local function utf8subWithBytes (s, i, j, sb)
 	j = j or -1
 	local cursor    = sb or 1
 	local byteCount = len(s)
 	local seen      = 0
 	-- the total character count is only needed to resolve negative indices
 	local total
 	if i < 0 or j < 0 then
 		total = utf8len(s)
 	end
 	local firstChar = i
 	if i < 0 then
 		firstChar = total + i + 1
 	end
 	local lastChar = j
 	if j < 0 then
 		lastChar = total + j + 1
 	end
 	-- an inverted interval selects nothing
 	if firstChar > lastChar then
 		return ""
 	end
 	-- translate the character interval into a byte interval for string.sub
 	local firstByte, lastByte = 1, byteCount
 	while cursor <= byteCount do
 		seen = seen + 1
 		if seen == firstChar then
 			firstByte = cursor
 		end
 		cursor = cursor + utf8charbytes(s, cursor)
 		if seen == lastChar then
 			lastByte = cursor - 1
 			break
 		end
 	end
 	if firstChar > seen then
 		firstByte = byteCount + 1
 	end
 	if lastChar < 1 then
 		lastByte = 0
 	end
 	return sub(s, firstByte, lastByte), lastByte + 1
 end
 -- Compiled matchers for magic patterns, indexed by the pattern string
 -- (matcherGenerator stores its result here).  The table is weak in both keys
 -- and values, so it does not by itself keep a matcher alive.
 local cache = setmetatable({},{
 	__mode = 'kv'
 })
 -- Compiled matchers for patterns generated in plain mode, indexed the same
 -- way and equally weak.
 local cachePlain = setmetatable({},{
 	__mode = 'kv'
 })
 -- Compiles a pattern into a matcher object and registers it in `cache`
 -- (or `cachePlain` when `plain` is truthy).
 --
 -- The matcher is a small backtracking state machine:
 --   matcher.functions  ordered list of step functions; each receives the code
 --                      point of the current character, or -1 once the
 --                      position has run past the end of the subject
 --   matcher.func       index of the step currently being executed
 --   matcher.str        current character position in the subject
 --   matcher.reset      "backtrack" action; steps that open an alternative
 --                      wrap the previous reset in a closure, forming a chain
 --   matcher.captures   list of {startChar, endChar} pairs, one per '(' in the
 --                      pattern
 --
 --   regex  pattern text
 --   plain  when truthy every character of `regex` is matched literally
 --
 -- Returns the matcher; use matcher:process(str, start) to run it.
 -- Raises an error for malformed patterns (dangling quantifier, misplaced
 -- '^' or '$', unbalanced parentheses).
 local function matcherGenerator(regex, plain)
 	local matcher = {
 		functions = {},
 		captures = {}
 	}
 	-- registered up front, before the step list has been built
 	if not plain then
 		cache[regex] =  matcher
 	else
 		cachePlain[regex] = matcher
 	end
 	-- step: the class must accept exactly one character; otherwise backtrack
 	local function simple(func)
 		return function(cC) 
 			if func(cC) then
 				matcher:nextFunc()
 				matcher:nextStr()
 			else
 				matcher:reset()
 			end
 		end
 	end
 	-- step for '*': while the class accepts, consume the character and stay on
 	-- this step, remembering (next step, current position) as a fallback
 	local function star(func)
 		return function(cC)
 			if func(cC) then
 				matcher:fullResetOnNextFunc()
 				matcher:nextStr()
 			else
 				matcher:nextFunc()
 			end
 		end
 	end
 	-- step for '-': always hands over to the next step first; if the class
 	-- accepts, a fallback (this step, next position) is remembered so one more
 	-- character can be taken after a later failure
 	local function minus(func)
 		return function(cC)
 			if func(cC) then
 				matcher:fullResetOnNextStr()
 			end
 			matcher:nextFunc()
 		end
 	end
 	-- step for '?': consumes the character if the class accepts it (keeping
 	-- "next step at the current position" as a fallback), then moves on
 	local function question(func)
 		return function(cC)
 			if func(cC) then
 				matcher:fullResetOnNextFunc()
 				matcher:nextStr()
 			end
 			matcher:nextFunc()
 		end
 	end
 	-- step for a back-reference (%1..%9): the text at the current position
 	-- must equal the text recorded for capture `id`
 	local function capture(id)
 		return function(cC)
 			local l = matcher.captures[id][2] - matcher.captures[id][1]
 			local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
 			local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
 			if captured == check then
 				-- advance past the l + 1 characters that were compared
 				for i = 0, l do
 					matcher:nextStr()
 				end
 				matcher:nextFunc()
 			else
 				matcher:reset()
 			end
 		end
 	end
 	-- step for '(': records the position where capture `id` begins
 	local function captureStart(id)
 		return function(cC)
 			matcher.captures[id][1] = matcher.str
 			matcher:nextFunc()
 		end
 	end
 	-- step for ')': records the last character position of capture `id`
 	local function captureStop(id)
 		return function(cC)
 			matcher.captures[id][2] = matcher.str - 1
 			matcher:nextFunc()
 		end
 	end
 	-- step for %bxy: `str` starts with the opening and closing delimiters.
 	-- Returns the step and the byte length of the two delimiter characters.
 	local function balancer(str)
 		-- current nesting depth
 		local sum = 0
 		local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
 		local skip = len(bc) + len(ec)
 		bc, ec = utf8unicode(bc), utf8unicode(ec)
 		return function(cC)
 			if cC == ec and sum > 0 then
 				sum = sum - 1
 				if sum == 0 then
 					matcher:nextFunc()
 				end
 				matcher:nextStr()
 			elseif cC == bc then
 				sum = sum + 1
 				matcher:nextStr()
 			else
 				-- either nothing was opened, or the subject ended while open
 				if sum == 0 or cC == -1 then
 					sum = 0
 					matcher:reset()
 				else
 					matcher:nextStr()
 				end
 			end
 		end, skip
 	end
 	-- first step: marks the current position as the start of a match attempt
 	-- and keeps "retry this step one character later" as the fallback
 	matcher.functions[1] = function(cC)
 		matcher:fullResetOnNextStr()
 		matcher.seqStart = matcher.str
 		matcher:nextFunc()
 		-- give up (no match) when an anchored pattern has moved off the start
 		-- position or the end of the subject has been reached
 		if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
 			matcher.stop = true
 			matcher.seqStart = nil
 		end
 	end
 	-- class predicate parsed last, waiting to see whether a quantifier follows
 	local lastFunc
 	-- true right after a '%'
 	local ignore = false
 	-- number of pattern bytes the iterator must jump over on its next call
 	local skip = nil
 	-- iterator over the characters of the pattern that forwards `skip`
 	local it = (function()
 		local gen = utf8gensub(regex)
 		return function()
 			return gen(skip)
 		end
 	end)()
 	-- stack of ids of the captures that are currently open
 	local cs = {}
 	for c, bs, be in it do
 		skip = nil
 		if plain then
 			table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
 		else
 			if ignore then
 				if find('123456789', c, 1, true) then
 					-- %1 .. %9: back-reference
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 						lastFunc = nil
 					end
 					table.insert(matcher.functions, capture(tonumber(c)))
 				elseif c == 'b' then
 					-- %bxy: balanced pair; the delimiters are skipped in the pattern
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 						lastFunc = nil
 					end
 					local b
 					b, skip = balancer(sub(regex, be + 1, be + 9))
 					table.insert(matcher.functions, b)
 				else
 					-- character class (%a, %d, ...) or escaped literal
 					lastFunc = classMatchGenerator('%' .. c)
 				end
 				ignore = false
 			else
 				if c == '*' then
 					if lastFunc then
 						table.insert(matcher.functions, star(lastFunc))
 						lastFunc = nil
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '+' then
 					-- one mandatory occurrence followed by '*'
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 						table.insert(matcher.functions, star(lastFunc))
 						lastFunc = nil
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '-' then
 					if lastFunc then
 						table.insert(matcher.functions, minus(lastFunc))
 						lastFunc = nil
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '?' then
 					if lastFunc then
 						table.insert(matcher.functions, question(lastFunc))
 						lastFunc = nil
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '^' then
 					-- anchor: only accepted as the very first byte of the pattern
 					if bs == 1 then
 						matcher.fromStart = true
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '$' then
 					-- anchor: only accepted as the very last byte of the pattern
 					if be == len(regex) then
 						matcher.toEnd = true
 					else
 						error('invalid regex after ' .. sub(regex, 1, bs))
 					end
 				elseif c == '[' then
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 					end
 					-- the set parser reports how much of the pattern it used
 					lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
 				elseif c == '(' then
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 						lastFunc = nil
 					end
 					table.insert(matcher.captures, {})
 					table.insert(cs, #matcher.captures)
 					table.insert(matcher.functions, captureStart(cs[#cs]))
 					-- "()" is flagged so process() reports the position instead of text
 					if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
 				elseif c == ')' then
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 						lastFunc = nil
 					end
 					local cap = table.remove(cs)
 					if not cap then
 						error('invalid capture: "(" missing')
 					end
 					table.insert(matcher.functions, captureStop(cap))
 				elseif c == '.' then
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 					end
 					-- any character, but not the end-of-subject marker
 					lastFunc = function(cC) return cC ~= -1 end
 				elseif c == '%' then
 					ignore = true
 				else
 					-- ordinary literal character
 					if lastFunc then
 						table.insert(matcher.functions, simple(lastFunc))
 					end
 					lastFunc = classMatchGenerator(c)
 				end
 			end
 		end
 	end
 	if #cs > 0 then
 		error('invalid capture: ")" missing')
 	end
 	-- a trailing class without quantifier
 	if lastFunc then
 		table.insert(matcher.functions, simple(lastFunc))
 	end
 	lastFunc = nil
 	ignore = nil
 	-- final step: reaching it means the pattern matched, unless a '$' anchor
 	-- requires the position to be at the end of the subject
 	table.insert(matcher.functions, function()
 		if matcher.toEnd and matcher.str ~= matcher.stringLen then
 			matcher:reset()
 		else
 			matcher.stop = true
 		end
 	end)
 	-- advance to the next step
 	matcher.nextFunc = function(self)
 		self.func = self.func + 1
 	end
 	-- advance to the next character of the subject
 	matcher.nextStr = function(self)
 		self.str = self.str + 1
 	end
 	-- pushes a fallback that only restores the current position
 	-- (not called anywhere inside matcherGenerator)
 	matcher.strReset = function(self)
 		local oldReset = self.reset
 		local str = self.str
 		self.reset = function(s)
 			s.str = str
 			s.reset = oldReset
 		end
 	end
 	-- pushes a fallback: continue with the NEXT step at the CURRENT position
 	matcher.fullResetOnNextFunc = function(self)
 		local oldReset = self.reset
 		local func = self.func +1
 		local str = self.str
 		self.reset = function(s)
 			s.func = func
 			s.str = str
 			s.reset = oldReset
 		end
 	end
 	-- pushes a fallback: continue with the CURRENT step at the NEXT position
 	matcher.fullResetOnNextStr = function(self)
 		local oldReset = self.reset
 		local str = self.str + 1
 		local func = self.func
 		self.reset = function(s)
 			s.func = func
 			s.str = str
 			s.reset = oldReset
 		end
 	end
 	-- Runs the matcher against `str`, beginning at character index `start`
 	-- (default 1; negative values count from the end of the string).
 	-- On success returns the first and last character index of the match
 	-- followed by the captures; returns nothing when there is no match.
 	matcher.process = function(self, str, start)
 		self.func = 1
 		start = start or 1
 		self.startStr = (start >= 0) and start or utf8len(str) + start + 1
 		self.seqStart = self.startStr
 		self.str = self.startStr
 		-- one past the last character index
 		self.stringLen = utf8len(str) + 1
 		self.string = str
 		self.stop = false
 		-- bottom of the fallback chain: restart from the first step
 		self.reset = function(s)
 			s.func = 1
 		end
 		local lastPos = self.str
 		local lastByte
 		local char
 		while not self.stop do
 			if self.str < self.stringLen then
 				--[[ if lastPos < self.str then
 					print('last byte', lastByte)
 					char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
 					char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
 					lastByte = lastByte - 1
 				else
 					char, lastByte = utf8subWithBytes(str, self.str, self.str)
 				end
 				lastPos = self.str ]]
 				char = utf8sub(str, self.str,self.str)
 				--print('char', char, utf8unicode(char))
 				self.functions[self.func](utf8unicode(char))
 			else
 				-- past the end of the subject: feed the end marker
 				self.functions[self.func](-1)
 			end
 		end
 		-- seqStart is cleared by the first step when the search was abandoned
 		if self.seqStart then
 			local captures = {}
 			for _,pair in pairs(self.captures) do
 				if pair.empty then
 					-- position capture "()"
 					table.insert(captures, pair[1])
 				else
 					table.insert(captures, utf8sub(str, pair[1], pair[2]))
 				end
 			end
 			return self.seqStart, self.str - 1, unpack(captures)
 		end
 	end
 	return matcher
 end
 -- string.find
 -- Searches `str` for `regex`, starting at character index `init`.
 --   plain  when truthy the pattern is matched literally
 -- Returns whatever matcher:process returns: the first and last character
 -- index of the match followed by the captures, or nothing without a match.
 local function utf8find(str, regex, init, plain)
 	-- Plain and magic matchers for the same pattern text behave differently
 	-- and live in separate caches, so the lookup must use the cache that
 	-- corresponds to the requested mode.
 	local store = plain and cachePlain or cache
 	local matcher = store[regex] or matcherGenerator(regex, plain)
 	return matcher:process(str, init)
 end
 -- string.match
 -- Looks for `regex` in `str` from character index `init` (default 1).
 -- Returns the captures when the pattern produced any, otherwise the matched
 -- text; returns nothing when the pattern does not match.
 local function utf8match(str, regex, init)
 	local result = {utf8find(str, regex, init or 1)}
 	local first, last = result[1], result[2]
 	if not first then
 		return
 	end
 	-- slots 3.. hold the captures
 	if result[3] then
 		return unpack(result, 3)
 	end
 	return utf8sub(str, first, last)
 end
 -- string.gmatch
 -- Returns an iterator over the successive matches of `regex` in `str`.
 --   all  when truthy every call yields the start and end character index of
 --        the match followed by its captures; otherwise it yields the
 --        captures, or the matched text when the pattern has none
 -- The iterator returns nothing once no further match is found.
 local function utf8gmatch(str, regex, all)
 	-- a leading '^' is escaped so that it is matched as an ordinary character
 	regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
 	local lastChar = 1
 	return function()
 		local found = {utf8find(str, regex, lastChar)}
 		if found[1] then
 			lastChar = found[2] + 1
 			if found[2] < found[1] then
 				-- Empty match: found[2] + 1 equals found[1], so resuming
 				-- there would return the same empty match forever.  Step one
 				-- character further to guarantee progress.
 				lastChar = lastChar + 1
 			end
 			if found[all and 1 or 3] then
 				return unpack(found, all and 1 or 3)
 			end
 			return utf8sub(str, found[1], found[2])
 		end
 	end
 end
 -- Computes the replacement for one match on behalf of utf8gsub.
 --   repl  string, table or function, with the meaning string.gsub gives them
 --   args  captures of the match in slots 1..n and the whole match in args[0]
 -- Returns the replacement value:
 --   * string: "%0" stands for the whole match, "%1".."%9" for a capture
 --     ("%1" is the whole match when there are no captures), '%' followed by
 --     any other character stands for that character
 --   * table: the value stored under the first capture (or the whole match)
 --   * function: the result of calling it with the captures (or the whole
 --     match)
 --   For tables and functions a nil or false result keeps the original match,
 --   as string.gsub does.  Any other `repl` type yields ''.
 -- Raises an error when a string replacement refers to a missing capture.
 local function replace(repl, args)
 	local kind = type(repl)
 	if kind == 'string' then
 		-- pieces are collected and joined once instead of re-concatenating
 		local pieces = {}
 		local ignore = false
 		for c in utf8gensub(repl) do
 			if ignore then
 				local num = tonumber(c)
 				if num then
 					local value = args[num]
 					if value == nil and num == 1 then
 						-- no captures: %1 refers to the whole match
 						value = args[0]
 					end
 					if value == nil then
 						error('invalid capture index %' .. c .. ' in replacement string')
 					end
 					pieces[#pieces + 1] = value
 				else
 					pieces[#pieces + 1] = c
 				end
 				ignore = false
 			elseif c == '%' then
 				ignore = true
 			else
 				pieces[#pieces + 1] = c
 			end
 		end
 		return table.concat(pieces)
 	end
 	local value
 	if kind == 'table' then
 		value = repl[args[1] or args[0]]
 	elseif kind == 'function' then
 		if #args > 0 then
 			value = repl(unpack(args, 1))
 		else
 			value = repl(args[0])
 		end
 	else
 		return ''
 	end
 	if value == nil or value == false then
 		-- no replacement: keep the matched text
 		return args[0] or ''
 	end
 	return value
 end
 -- string.gsub
 -- Replaces matches of `regex` in `str` using `repl` (string, table or
 -- function, see replace).
 --   limit  maximum number of replacements; unlimited when omitted
 -- Returns the resulting string and the number of replacements made.
 local function utf8gsub(str, regex, repl, limit)
 	limit = limit or -1
 	local nextMatch = utf8gmatch(str, regex, true)
 	local out = ''
 	-- character index just behind the previous match
 	local cursor = 1
 	local count = 0
 	-- each hit is {startChar, endChar, captures...}
 	local hit = {nextMatch()}
 	while true do
 		if #hit == 0 or count == limit then
 			break
 		end
 		local whole = utf8sub(str, hit[1], hit[2])
 		local captures = {[0] = whole, unpack(hit, 3)}
 		-- untouched text between the previous match and this one
 		local gap = utf8sub(str, cursor, hit[1] - 1)
 		out = out .. gap .. replace(repl, captures)
 		cursor = hit[2] + 1
 		count = count + 1
 		hit = {nextMatch()}
 	end
 	-- append whatever follows the last replaced match
 	return out .. utf8sub(str, cursor), count
 end
 -- Install the UTF-8 aware implementations over the standard string library.
 -- NOTE: this changes the global `string` table, so every piece of code that
 -- calls these functions afterwards gets the replacements.
 -- dump, format, rep, lower and upper are assigned the functions captured in
 -- the local aliases at the top of the file (the UTF-8 case conversions are
 -- left commented out); gensub is an extra entry.
 string.byte    = utf8unicode
 string.char    = utf8char
 string.dump    = dump
 string.find    = utf8find
 string.format  = format
 string.gmatch  = utf8gmatch
 string.gsub    = utf8gsub
 string.len     = utf8len
 string.lower   = lower--utf8lower
 string.match   = utf8match
 string.rep     = rep
 string.reverse = utf8reverse
 string.sub     = utf8sub
 string.upper   = upper--utf8upper
 string.gensub  = utf8gensub
 --[[ local utf8 = {}                                                                                             
 utf8.len = utf8len
 utf8.sub = utf8sub
 utf8.reverse = utf8reverse
 utf8.char = utf8char
 utf8.unicode = utf8unicode
 utf8.gensub = utf8gensub
 utf8.byte = utf8unicode
 utf8.find    = utf8find
 utf8.match   = utf8match
 utf8.gmatch  = utf8gmatch
 utf8.gsub    = utf8gsub  
 utf8.dump    = dump  
 utf8.format = format 
 utf8.lower = lower      
 utf8.upper = upper      
 utf8.rep     = rep
 return utf8 ]]