diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e69de29..0000000 diff --git a/loveframes/third-party/utf8/LICENSE b/loveframes/third-party/utf8/LICENSE new file mode 100644 index 0000000..fd3b301 --- /dev/null +++ b/loveframes/third-party/utf8/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 Stepets + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/loveframes/third-party/utf8/README.md b/loveframes/third-party/utf8/README.md new file mode 100644 index 0000000..1b2d803 --- /dev/null +++ b/loveframes/third-party/utf8/README.md @@ -0,0 +1,62 @@ +# utf8.lua +pure-lua 5.3 regex library for Lua 5.3, Lua 5.1, LuaJIT + +This library provides simple way to add UTF-8 support into your application. 
+ +#### Example: +```Lua +local utf8 = require('.utf8'):init() +for k,v in pairs(utf8) do + string[k] = v +end + +local str = "пыщпыщ ололоо я водитель нло" +print(str:find("(.л.+)н")) +-- 8 26 ололоо я водитель + +print(str:gsub("ло+", "보라")) +-- пыщпыщ о보라보라 я водитель н보라 3 + +print(str:match("^п[лопыщ ]*я")) +-- пыщпыщ ололоо я +``` + +#### Usage: + +This library can be used as a drop-in replacement for the vanilla string library. It exports all vanilla functions under the `raw` sub-object. + +```Lua +local utf8 = require('.utf8'):init() +local str = "пыщпыщ ололоо я водитель нло" +utf8.gsub(str, "ло+", "보라") +-- пыщпыщ о보라보라 я водитель н보라 3 +utf8.raw.gsub(str, "ло+", "보라") +-- пыщпыщ о보라보라о я водитель н보라 3 +``` + +It also provides all functions from the Lua 5.3 UTF-8 [module](https://www.lua.org/manual/5.3/manual.html#6.5) except `utf8.len (s [, i [, j]])`. If you need to validate your strings, use `utf8.validate(str, byte_pos)` or iterate over them with `utf8.validator`. + +#### Installation: + +Download the repository to your project folder. (no rockspecs yet) + +As of Lua 5.3, the default `utf8` module takes precedence over a user-provided one. In this case you can specify the full module path (`.utf8`). + +#### Configuration: + +The library is highly modular. You can provide your own implementation for almost any function used. The library already has several back-ends: +- [Runtime character class processing](charclass/runtime/init.lua) using hardcoded codepoint ranges or using native functions through `ffi`. +- [Basic functions](primitives/init.lua) for working with UTF-8 characters have specializations for `ffi`-enabled runtime and for tarantool. + +Probably the most interesting [customizations](init.lua) are `utf8.config.loadstring` and `utf8.config.cache` if you want to precompile your regexes. + +```Lua +local utf8 = require('.utf8') +utf8.config = { + cache = my_smart_cache, +} +utf8:init() +``` +Customization is done before initialization. 
--- Ask each registered "begins" parser, in order, to handle the current
-- pattern position and return the first successful result.
--
-- Every argument is forwarded unchanged to the `parse` function of each
-- entry in `utf8.config.begins`.
--
-- @param regex the pattern being compiled
-- @param c     the current pattern character
-- @param bs    the current position in `regex`
-- @param ctx   the compile-time context
-- @return functions, move  whatever the first parser that produced a truthy
--         `functions` value returned; nothing if no parser matched
function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx)
  for index, matcher in ipairs(utf8.config.begins) do
    local functions, move = matcher.parse(regex, c, bs, ctx)
    -- The original passed an undeclared global `nbs` in the fifth slot. It is
    -- always nil unless some other code leaks a global of that name, and it
    -- raises under strict-global environments. A literal nil keeps the log
    -- layout identical without touching the global table.
    utf8.debug("begins", index, c, bs, nil, move, functions)
    if functions then
      return functions, move
    end
  end
end
--- Add code points to this builder and keep the list sorted.
--
-- Each argument is either a number, stored as-is, or a value that is
-- converted with `byte` (the module-local alias of `utf8.byte`).
--
-- @param ... code points or characters to add
-- @return self, to allow chaining
function builder:with_codes(...)
  local list = self.codes or {}
  self.codes = list

  for _, item in ipairs({...}) do
    local code = item
    if type(item) ~= "number" then
      -- Assignment to a single local keeps only the first result of `byte`.
      code = byte(item)
    end
    table.insert(list, code)
  end

  table.sort(list)
  return self
end
--- Merge another builder into this one.
--
-- A non-inverted builder is flattened: its codes, ranges, classes and
-- negated classes are appended to this builder's own lists. An inverted
-- builder cannot be flattened, so it is kept whole in `self.includes`
-- (which `build` emits through `with_subs`).
--
-- @param b the builder to include
-- @return self, to allow chaining
function builder:include(b)
  if b.inverted then
    local pending = self.includes or {}
    pending[#pending + 1] = b
    self.includes = pending
    return self
  end

  if b.codes then
    self:with_codes(unpack(b.codes))
  end
  if b.ranges then
    self:with_ranges(unpack(b.ranges))
  end
  if b.classes then
    self:with_classes(unpack(b.classes))
  end
  if b.not_classes then
    self:without_classes(unpack(b.not_classes))
  end

  return self
end
--- Compile-time parser for `x-y` ranges inside a `[...]` set.
--
-- It only applies while the parser is inside brackets (`ctx.internal`).
-- Either endpoint may be escaped with `%`.
--
-- @param str the pattern being compiled
-- @param c   the current pattern character
-- @param bs  the position of `c` in `str`
-- @param ctx the compile-time context
-- @return a builder holding the single range, and the distance to skip from
--         `bs`; nothing when the text at `bs` is not a range
return function(str, c, bs, ctx)
  if not ctx.internal then return end

  local r1, r2
  local nbs = bs

  -- Lower endpoint, optionally escaped.
  if c == '%' then
    c, nbs = next(str, nbs)
  end
  r1 = c

  utf8.debug("range r1", r1, nbs)

  c, nbs = next(str, nbs)
  if c ~= '-' then return end

  -- Upper endpoint. An unescaped ']' right after the hyphen closes the set,
  -- so the hyphen is a literal ("You can put a hyphen in a set by positioning
  -- it as the first or the last character in the set"). Previously the ']'
  -- was consumed as the range end, leaving the set unterminated.
  c, nbs = next(str, nbs)
  if c == '%' then
    c, nbs = next(str, nbs)
    r2 = c
  elseif c ~= '' and c ~= ']' then
    r2 = c
  end

  utf8.debug("range r2", r2, nbs)

  if r1 and r2 then
    return cl.new():with_ranges{utf8.byte(r1), utf8.byte(r2)}, utf8.next(str, nbs) - bs
  end
end
-- Lower-case class letter -> character class name. The upper-case form of the
-- same letter selects the complement of the class.
local class_by_letter = {
  a = 'alpha',
  c = 'cntrl',
  d = 'digit',
  g = 'graph',
  l = 'lower',
  p = 'punct',
  s = 'space',
  u = 'upper',
  w = 'alnum',
  x = 'xdigit',
}

--- Parse one vanilla Lua character class at position `bs`.
--
-- It handles `%a`-style classes and their upper-case complements, bracketed
-- sets `[...]` / `[^...]`, and the dot. Anything else yields a nil class.
--
-- @param str the pattern being compiled
-- @param c   the current pattern character
-- @param bs  the position of `c` in `str`
-- @param ctx the compile-time context (`ctx.internal` flags "inside []")
-- @return the class builder (or nil) and the distance to skip from `bs`
local function parse(str, c, bs, ctx)
  -- Per-call id, used only to correlate debug output.
  local call_id = token
  token = token + 1

  local class
  local nbs = bs
  utf8.debug("cc_parse", call_id, str, c, nbs, next(str, nbs))

  if c == '%' then
    c, nbs = next(str, bs)
    local lowered = utf8.raw.lower(c)
    local matched = class_by_letter[lowered]

    if matched then
      if lowered == c then
        class = cl.new():with_classes(matched)
      else
        -- Upper-case letter: complement of the class.
        class = cl.new():without_classes(matched)
      end
    end
  elseif c == '[' then
    local outer_internal = ctx.internal
    ctx.internal = true
    class = cl.new()

    local at_first = true
    while true do
      local item_start = nbs
      c, nbs = next(str, nbs)
      utf8.debug("next", call_id, c, nbs)

      if c == '^' and at_first then
        class:invert()
      elseif c == ']' then
        utf8.debug('] on pos', call_id, nbs)
        break
      elseif c == '' then
        error "malformed pattern (missing ']')"
      else
        -- Delegate the set member to the registered charclass parsers.
        local sub_class, skip = utf8.regex.compiletime.charclass.parse(str, c, nbs, ctx)
        nbs = item_start + skip
        utf8.debug("include", call_id, bs, item_start, nbs, skip)
        class:include(sub_class)
      end

      at_first = false
    end

    ctx.internal = outer_internal
  elseif c == '.' then
    if ctx.internal then
      -- Inside a set the dot is an ordinary character.
      class = cl.new():with_codes(c)
    else
      -- Outside a set: an inverted empty class.
      class = cl.new():invert()
    end
  end

  return class, utf8.next(str, nbs) - bs
end
--- Append ranges to this class.
--
-- Each argument is stored as given; `in_ranges` later reads element 1 as the
-- lower bound and element 2 as the upper bound.
--
-- @param ... range tables to add
-- @return self, to allow chaining
function class:with_ranges(...)
  local ranges = self.ranges
  if not ranges then
    ranges = {}
    self.ranges = ranges
  end

  for _, range in ipairs({...}) do
    ranges[#ranges + 1] = range
  end

  return self
end
--- Binary-search the `codes` list for `item`.
--
-- `with_codes` sorts the list after every insertion, which is what makes a
-- binary search valid here.
--
-- @param item the code point to look for
-- @return true and the index where it was found, or false
function class:in_codes(item)
  local codes = self.codes
  if not codes then return false end

  -- Narrow [lo, hi] until the bounds are adjacent (or the list has at most
  -- two entries), then check both ends.
  local lo, hi = 1, #codes
  while hi - lo > 1 do
    local probe = math.floor((lo + hi) / 2)
    if codes[probe] > item then
      hi = probe
    else
      lo = probe
    end
  end

  if codes[lo] == item then
    return true, lo
  end
  if codes[hi] == item then
    return true, hi
  end
  return false
end
"'", char_code) + return result +end + +function class:do_test(char_code) + if not char_code then return false end + local found = (self:in_codes(char_code) or self:in_ranges(char_code) or self:in_classes(char_code) or self:in_subs(char_code)) and not self:in_not_classes(char_code) + utf8.debug('class:do_test', 'found', found, 'inverted', self.inverted, 'result', self.inverted and not found or found) + -- utf8.debug(self:in_codes(char_code), self:in_ranges(char_code), self:in_classes(char_code), self:in_subs(char_code), not self:in_not_classes(char_code)) + -- ternary if ideom (self.inverted and not found or found) doesn't work with booleans >_< + if self.inverted then + return not found + else + return found + end +end + +return class + +end diff --git a/loveframes/third-party/utf8/charclass/runtime/dummy.lua b/loveframes/third-party/utf8/charclass/runtime/dummy.lua new file mode 100644 index 0000000..1faddc1 --- /dev/null +++ b/loveframes/third-party/utf8/charclass/runtime/dummy.lua @@ -0,0 +1,41 @@ +return function(utf8) + +local base = utf8:require "charclass.runtime.base" + +local dummy = setmetatable({}, {__index = base}) +local mt = {__index = dummy} + +function dummy.new() + return setmetatable({}, mt) +end + +function dummy:with_classes(...) 
--- Select the runtime character-class implementation.
--
-- `utf8.config.runtime_charclasses`, when set, takes priority: a table is
-- used as-is, a function is called with `utf8`, and any other value is
-- handed to `utf8:require`. Otherwise the choice depends on whether the
-- `ffi` module can be loaded.
return function(utf8)

local provided = utf8.config.runtime_charclasses

if provided then
  local kind = type(provided)
  if kind == "table" then
    return provided
  end
  if kind == "function" then
    return provided(utf8)
  end
  return utf8:require(provided)
end

-- Only the success flag of pcall is needed here.
local has_ffi = pcall(require, "ffi")
if has_ffi then
  return utf8:require "charclass.runtime.native"
end
return utf8:require "charclass.runtime.dummy"

end
-- Character class name -> name of the C predicate declared in the cdef above.
local predicate_names = {
  alpha  = "iswalpha",
  cntrl  = "iswcntrl",
  digit  = "iswdigit",
  graph  = "iswgraph",
  lower  = "iswlower",
  punct  = "iswpunct",
  space  = "iswspace",
  upper  = "iswupper",
  alnum  = "iswalnum",
  xdigit = "iswxdigit",
}

--- Test a code point against a named character class using the C library.
--
-- @param class     one of the class names in `predicate_names`
-- @param char_code the code point to test
-- @return true/false for a known class; nothing for an unknown class name
function native:is(class, char_code)
  local fname = predicate_names[class]
  if fname then
    -- The C predicates return an int; non-zero means "belongs to the class".
    return ffi.C[fname](char_code) ~= 0
  end
end
--- Create a runtime matching context, optionally seeded from `obj`.
--
-- `result` and `captures` are copied with `util.copy` (captures with its
-- second argument set to true), so the new context does not write into the
-- source's tables. `functions` is taken by reference. `ctx:clone()` passes
-- the context itself as `obj`.
--
-- @param obj optional table of initial field values
-- @return the new context, with `mt` as its metatable
function ctx.new(obj)
  obj = obj or {}

  local instance = {}
  instance.pos = obj.pos or 1
  instance.str = obj.str or nil
  instance.starts = obj.starts or nil
  instance.functions = obj.functions or {}
  instance.func_pos = obj.func_pos or 1
  instance.ends = obj.ends or nil

  if obj.result then
    instance.result = util.copy(obj.result) or {}
  else
    instance.result = {}
  end

  if obj.captures then
    instance.captures = util.copy(obj.captures, true) or {active = {}}
  else
    instance.captures = {active = {}}
  end

  return setmetatable(instance, mt)
end
-- Source snippets for the function that terminates a compiled matcher. Each
-- entry returns Lua source text that is spliced into the generated matcher
-- and registers one step through `add`.
local matchers = {
  --- Unanchored end: the match finishes wherever the previous step stopped.
  any = function()
    return [[
    add(function(ctx) -- any
        ctx.result.finish = ctx.pos - 1
        ctx:done()
    end)
]]
  end,
  --- Anchored end (`$`): succeed only when the whole subject was consumed.
  -- `ctx.pos` is a character index (the runtime context advances it by one
  -- per character and reads characters with utf8 `sub`), so it must be
  -- compared with the character length. The previous `#ctx.str` is a byte
  -- length, which made `$` fail on any subject containing multibyte
  -- characters. `utf8len` is the same name the generated "sliding" begin
  -- matcher already uses.
  toend = function(ctx)
    return [[
    add(function(ctx) -- toend
        ctx.result.finish = ctx.pos - 1
        if ctx.pos == utf8len(ctx.str) + 1 then ctx:done() end
    end)
]]
  end,
}
--- Compute the replacement text for one `gsub` match.
--
-- This follows `string.gsub` semantics:
--   * string `repl`: `%0` is the whole match, `%1`..`%9` are captures, and
--     `%` followed by any other character yields that character. With no
--     captures, `%1` stands for the whole match.
--   * table `repl`: looked up with the first capture (or the whole match).
--   * function `repl`: called with all captures (or the whole match).
--   For table and function replacements a nil/false value keeps the original
--   match instead of deleting it.
--
-- @param repl the replacement: string, table or function
-- @param args the captures at 1..n and the whole match at index 0
-- @return the replacement for this match ('' for unsupported `repl` types)
local function replace(repl, args)
  local repl_type = type(repl)

  if repl_type == 'string' then
    -- Collect pieces and join once, rather than rebuilding the string for
    -- every character.
    local parts = {}
    local escaped = false
    for _, c in utf8gensub(repl) do
      if escaped then
        local num = tonumber(c)
        if num then
          local capture = args[num]
          if capture == nil and num == 1 then
            -- No explicit captures: %1 refers to the whole match.
            capture = args[0]
          end
          if capture == nil then
            -- Previously this surfaced as "attempt to concatenate a nil value".
            error("invalid capture index %" .. c .. " in replacement string")
          end
          parts[#parts + 1] = capture
        else
          parts[#parts + 1] = c
        end
        escaped = false
      elseif c == '%' then
        escaped = true
      else
        parts[#parts + 1] = c
      end
    end
    return table.concat(parts)
  end

  local value
  if repl_type == 'table' then
    value = repl[args[1] or args[0]]
  elseif repl_type == 'function' then
    if #args > 0 then
      value = repl(unpack(args, 1))
    else
      value = repl(args[0])
    end
  else
    return ''
  end

  -- nil/false means "no replacement": keep the matched text.
  return value or args[0]
end
--- Load a sub-module of this library, relative to the library's own path.
--
-- Sub-modules normally return a factory function. It is called once with the
-- library table, and its result replaces the factory in `package.loaded`, so
-- later calls return the initialised module directly.
--
-- @param name module name relative to the library root
-- @return the loaded (and, for factories, initialised) module
function utf8:require(name)
  local key = module_path .. name

  local cached = package.loaded[key]
  if cached then
    return cached
  end

  local loaded = require(key)
  if type(loaded) ~= 'function' then
    return loaded
  end

  local instance = loaded(self)
  package.loaded[key] = instance
  return instance
end
[[:test(ctx:get_charcode()) then + ctx:next_function() + return ctx:get_function()(ctx) + end + end) +]] + end, + simple = utf8:require("modifier.compiletime.simple").simple, +} + +local function parse(regex, c, bs, ctx) + local functions, nbs, class + + if c == '%' then + if utf8.raw.sub(regex, bs + 1, bs + 1) ~= 'f' then return end + if utf8.raw.sub(regex, bs + 2, bs + 2) ~= '[' then error("missing '[' after '%f' in pattern") end + + functions = {} + if ctx.prev_class then + table.insert(functions, matchers.simple(ctx.prev_class, tostring(bs))) + ctx.prev_class = nil + end + class, nbs = utf8.regex.compiletime.charclass.parse(regex, '[', bs + 2, ctx) + nbs = nbs + 2 + table.insert(functions, matchers.frontier(class:build(), tostring(bs))) + end + + return functions, nbs +end + +return { + parse = parse, +} + +end diff --git a/loveframes/third-party/utf8/modifier/compiletime/parser.lua b/loveframes/third-party/utf8/modifier/compiletime/parser.lua new file mode 100644 index 0000000..0fb2e53 --- /dev/null +++ b/loveframes/third-party/utf8/modifier/compiletime/parser.lua @@ -0,0 +1,20 @@ +return function(utf8) + +utf8.config.modifier = utf8.config.modifier or { + utf8:require "modifier.compiletime.vanilla", + utf8:require "modifier.compiletime.frontier", + utf8:require "modifier.compiletime.stub", +} + +function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx) + for _, m in ipairs(utf8.config.modifier) do + local functions, move = m.parse(regex, c, bs, ctx) + utf8.debug("mod", _, c, bs, nbs, move, functions and utf8.config.unpack(functions)) + if functions then + ctx.prev_class = nil + return functions, move + end + end +end + +end diff --git a/loveframes/third-party/utf8/modifier/compiletime/simple.lua b/loveframes/third-party/utf8/modifier/compiletime/simple.lua new file mode 100644 index 0000000..d1e3943 --- /dev/null +++ b/loveframes/third-party/utf8/modifier/compiletime/simple.lua @@ -0,0 +1,23 @@ +return function(utf8) + +local matchers = { + simple = 
function(class, name) + local class_name = 'class' .. name + return [[ + local ]] .. class_name .. [[ = ]] .. class .. [[ + + add(function(ctx) -- simple + debug(ctx, 'simple', ']] .. class_name .. [[') + if ]] .. class_name .. [[:test(ctx:get_charcode()) then + ctx:next_char() + ctx:next_function() + return ctx:get_function()(ctx) + end + end) +]] + end, +} + +return matchers + +end diff --git a/loveframes/third-party/utf8/modifier/compiletime/stub.lua b/loveframes/third-party/utf8/modifier/compiletime/stub.lua new file mode 100644 index 0000000..16458b3 --- /dev/null +++ b/loveframes/third-party/utf8/modifier/compiletime/stub.lua @@ -0,0 +1,28 @@ +return function(utf8) + +local matchers = utf8:require("modifier.compiletime.simple") + +local function parse(regex, c, bs, ctx) + local functions + + if ctx.prev_class then + functions = { matchers.simple(ctx.prev_class, tostring(bs)) } + ctx.prev_class = nil + end + + return functions, 0 +end + +local function check(ctx) + if ctx.prev_class then + table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(bs))) + ctx.prev_class = nil + end +end + +return { + parse = parse, + check = check, +} + +end diff --git a/loveframes/third-party/utf8/modifier/compiletime/vanilla.lua b/loveframes/third-party/utf8/modifier/compiletime/vanilla.lua new file mode 100644 index 0000000..0723583 --- /dev/null +++ b/loveframes/third-party/utf8/modifier/compiletime/vanilla.lua @@ -0,0 +1,226 @@ +return function(utf8) + +local utf8unicode = utf8.byte +local sub = utf8.raw.sub + +local matchers = { + star = function(class, name) + local class_name = 'class' .. name + return [[ + local ]] .. class_name .. [[ = ]] .. class .. [[ + + add(function(ctx) -- star + debug(ctx, 'star', ']] .. class_name .. [[') + local saved = {ctx:clone()} + while ]] .. class_name .. 
[[:test(ctx:get_charcode()) do + ctx:next_char() + table.insert(saved, ctx:clone()) + debug('#saved <<', #saved) + end + while #saved > 0 do + ctx = table.remove(saved) + ctx:next_function() + ctx:get_function()(ctx) + debug('#saved >>', #saved) + end + end) +]] + end, + minus = function(class, name) + local class_name = 'class' .. name + return [[ + local ]] .. class_name .. [[ = ]] .. class .. [[ + + add(function(ctx) -- minus + debug(ctx, 'minus', ']] .. class_name .. [[') + + repeat + local saved = ctx:clone() + ctx:next_function() + ctx:get_function()(ctx) + ctx = saved + local match = ]] .. class_name .. [[:test(ctx:get_charcode()) + ctx:next_char() + until not match + end) +]] + end, + question = function(class, name) + local class_name = 'class' .. name + return [[ + local ]] .. class_name .. [[ = ]] .. class .. [[ + + add(function(ctx) -- question + debug(ctx, 'question', ']] .. class_name .. [[') + local saved = ctx:clone() + if ]] .. class_name .. [[:test(ctx:get_charcode()) then + ctx:next_char() + ctx:next_function() + ctx:get_function()(ctx) + end + ctx = saved + ctx:next_function() + return ctx:get_function()(ctx) + end) +]] + end, + capture_start = function(number) + return [[ + add(function(ctx) + debug(ctx, 'capture_start', ']] .. tostring(number) .. [[') + table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start_byte = byte_pos, start = ctx.pos }) + ctx:next_function() + return ctx:get_function()(ctx) + end) +]] + end, + capture_finish = function(number) + return [[ + add(function(ctx) + debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[') + local cap = table.remove(ctx.captures.active) + cap.finish_byte = byte_pos + cap.finish = ctx.pos + ctx.captures[cap.id] = utf8sub(ctx.str, cap.start, cap.finish - 1) + debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. 
']' , 'is', ctx.captures[cap.id]) + ctx:next_function() + return ctx:get_function()(ctx) + end) +]] + end, + capture = function(number) + return [[ + add(function(ctx) + debug(ctx, 'capture', ']] .. tostring(number) .. [[') + local cap = ctx.captures[ ]] .. tostring(number) .. [[ ] + local len = utf8len(cap) + local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1) + debug("capture check:", cap, check) + if cap == check then + ctx.pos = ctx.pos + len + ctx:next_function() + return ctx:get_function()(ctx) + end + end) +]] + end, + balancer = function(pair, name) + local class_name = 'class' .. name + return [[ + + add(function(ctx) -- balancer + local d, b = ]] .. tostring(utf8unicode(pair[1])) .. [[, ]] .. tostring(utf8unicode(pair[2])) .. [[ + if ctx:get_charcode() ~= d then return end + local balance = 0 + repeat + local c = ctx:get_charcode() + if c == nil then return end + + if c == d then + balance = balance + 1 + elseif c == b then + balance = balance - 1 + end + debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode()) + ctx:next_char() + until balance == 0 + ctx:next_function() + return ctx:get_function()(ctx) + end) +]] + end, + simple = utf8:require("modifier.compiletime.simple").simple, +} + +local next = utf8.util.next + +local function parse(regex, c, bs, ctx) + local functions, nbs = nil, bs + if c == '%' then + c, nbs = next(regex, bs) + utf8.debug("next", c, bs) + if utf8.raw.find('123456789', c, 1, true) then + functions = { matchers.capture(tonumber(c)) } + nbs = utf8.next(regex, nbs) + elseif c == 'b' then + local d, b + d, nbs = next(regex, nbs) + b, nbs = next(regex, nbs) + functions = { matchers.balancer({d, b}, tostring(bs)) } + nbs = utf8.next(regex, nbs) + end + + if functions and ctx.prev_class then + table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) + end + elseif c == '*' and ctx.prev_class then + functions = { + matchers.star( + ctx.prev_class, + tostring(bs) + ) + } + nbs = 
bs + 1 + elseif c == '+' and ctx.prev_class then + functions = { + matchers.simple( + ctx.prev_class, + tostring(bs) + ), + matchers.star( + ctx.prev_class, + tostring(bs) + ) + } + nbs = bs + 1 + elseif c == '-' and ctx.prev_class then + functions = { + matchers.minus( + ctx.prev_class, + tostring(bs) + ) + } + nbs = bs + 1 + elseif c == '?' and ctx.prev_class then + functions = { + matchers.question( + ctx.prev_class, + tostring(bs) + ) + } + nbs = bs + 1 + elseif c == '(' then + ctx.capture = ctx.capture or {balance = 0, id = 0} + ctx.capture.balance = ctx.capture.balance + 1 + ctx.capture.id = ctx.capture.id + 1 + functions = { matchers.capture_start(ctx.capture.id) } + if ctx.prev_class then + table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) + end + nbs = bs + 1 + elseif c == ')' then + ctx.capture = ctx.capture or {balance = 0, id = 0} + functions = { matchers.capture_finish(ctx.capture.id) } + + ctx.capture.balance = ctx.capture.balance - 1 + assert(ctx.capture.balance >= 0, 'invalid capture: "(" missing') + + if ctx.prev_class then + table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) + end + nbs = bs + 1 + end + + return functions, nbs - bs +end + +local function check(ctx) + if ctx.capture then assert(ctx.capture.balance == 0, 'invalid capture: ")" missing') end +end + +return { + parse = parse, + check = check, +} + +end diff --git a/loveframes/third-party/utf8/primitives/dummy.lua b/loveframes/third-party/utf8/primitives/dummy.lua new file mode 100644 index 0000000..90a5d7e --- /dev/null +++ b/loveframes/third-party/utf8/primitives/dummy.lua @@ -0,0 +1,522 @@ +-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $ +-- +-- Provides UTF-8 aware string functions implemented in pure lua: +-- * utf8len(s) +-- * utf8sub(s, i, j) +-- * utf8reverse(s) +-- * utf8char(unicode) +-- * utf8unicode(s, i, j) +-- * utf8gensub(s, sub_len) +-- * utf8find(str, regex, init, plain) +-- * utf8match(str, regex, init) +-- * 
utf8gmatch(str, regex, all) +-- * utf8gsub(str, regex, repl, limit) +-- +-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these +-- additional functions are available: +-- * utf8upper(s) +-- * utf8lower(s) +-- +-- All functions behave as their non UTF-8 aware counterparts with the exception +-- that UTF-8 characters are used instead of bytes for all units. + +--[[ +Copyright (c) 2006-2007, Kyle Smith +All rights reserved. + +Contributors: + Alimov Stepan + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the author nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+--]] + +-- ABNF from RFC 3629 +-- +-- UTF8-octets = *( UTF8-char ) +-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +-- UTF8-1 = %x00-7F +-- UTF8-2 = %xC2-DF UTF8-tail +-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / +-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) +-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / +-- %xF4 %x80-8F 2( UTF8-tail ) +-- UTF8-tail = %x80-BF +-- +return function(utf8) + +local byte = string.byte +local char = string.char +local dump = string.dump +local find = string.find +local format = string.format +local len = string.len +local lower = string.lower +local rep = string.rep +local sub = string.sub +local upper = string.upper + +local function utf8symbollen(byte) + return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1 +end + +local function utf8charbytes(str, bs) + return utf8symbollen(byte(str, bs)) +end + +local function utf8next(str, bs) + return bs + utf8charbytes(str, bs) +end + +-- returns the number of characters in a UTF-8 string +local function utf8len (str) + local bs = 1 + local bytes = len(str) + local length = 0 + + while bs <= bytes do + length = length + 1 + bs = utf8next(str, bs) + end + + return length +end + +-- functions identically to string.sub except that i and j are UTF-8 characters +-- instead of bytes +local function utf8sub (s, i, j) + -- argument defaults + j = j or -1 + + local bs = 1 + local bytes = len(s) + local length = 0 + + local l = (i >= 0 and j >= 0) or utf8len(s) + i = (i >= 0) and i or l + i + 1 + j = (j >= 0) and j or l + j + 1 + + if i > j then + return "" + end + + local start, finish = 1, bytes + + while bs <= bytes do + length = length + 1 + + if length == i then + start = bs + end + + bs = utf8next(s, bs) + + if length == j then + finish = bs - 1 + break + end + end + + if i > length then start = bytes + 1 end + if j < 1 then finish = 0 end + + return sub(s, start, finish) +end + +-- 
http://en.wikipedia.org/wiki/Utf8 +-- http://developer.coronalabs.com/code/utf-8-conversion-utility +local function utf8char(...) + local codes = {...} + local result = {} + + for _, unicode in ipairs(codes) do + + if unicode <= 0x7F then + result[#result + 1] = unicode + elseif unicode <= 0x7FF then + local b0 = 0xC0 + math.floor(unicode / 0x40); + local b1 = 0x80 + (unicode % 0x40); + result[#result + 1] = b0 + result[#result + 1] = b1 + elseif unicode <= 0xFFFF then + local b0 = 0xE0 + math.floor(unicode / 0x1000); + local b1 = 0x80 + (math.floor(unicode / 0x40) % 0x40); + local b2 = 0x80 + (unicode % 0x40); + result[#result + 1] = b0 + result[#result + 1] = b1 + result[#result + 1] = b2 + elseif unicode <= 0x10FFFF then + local code = unicode + local b3= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local b2= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local b1= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local b0= 0xF0 + code; + + result[#result + 1] = b0 + result[#result + 1] = b1 + result[#result + 1] = b2 + result[#result + 1] = b3 + else + error 'Unicode cannot be greater than U+10FFFF!' 
+ end + + end + + return char(utf8.config.unpack(result)) +end + + +local shift_6 = 2^6 +local shift_12 = 2^12 +local shift_18 = 2^18 + +local utf8unicode +utf8unicode = function(str, ibs, jbs) + if ibs > jbs then return end + + local ch,bytes + + bytes = utf8charbytes(str, ibs) + if bytes == 0 then return end + ch = sub(str,ibs,ibs-1+bytes) + + local unicode + + if bytes == 1 then unicode = byte(ch) end + if bytes == 2 then + local byte0,byte1 = byte(ch,1,2) + if byte0 and byte1 then + local code0,code1 = byte0-0xC0,byte1-0x80 + unicode = code0*shift_6 + code1 + else + unicode = byte0 + end + end + if bytes == 3 then + local byte0,byte1,byte2 = byte(ch,1,3) + if byte0 and byte1 and byte2 then + local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80 + unicode = code0*shift_12 + code1*shift_6 + code2 + else + unicode = byte0 + end + end + if bytes == 4 then + local byte0,byte1,byte2,byte3 = byte(ch,1,4) + if byte0 and byte1 and byte2 and byte3 then + local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80 + unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3 + else + unicode = byte0 + end + end + + return unicode,utf8unicode(str, ibs+bytes, jbs) +end + +local function utf8byte(str, i, j) + if #str == 0 then return end + + local ibs, jbs + + if i or j then + i = i or 1 + j = j or i + + local str_len = utf8len(str) + i = i < 0 and str_len + i + 1 or i + j = j < 0 and str_len + j + 1 or j + j = j > str_len and str_len or j + + if i > j then return end + + for p = 1, i - 1 do + ibs = utf8next(str, ibs or 1) + end + + if i == j then + jbs = ibs + else + for p = 1, j - 1 do + jbs = utf8next(str, jbs or 1) + end + end + + if not ibs or not jbs then + return nil + end + else + ibs, jbs = 1, 1 + end + + return utf8unicode(str, ibs, jbs) +end + +local function utf8gensub(str, sub_len) + sub_len = sub_len or 1 + local max_len = #str + return function(skip_ptr, bs) + bs = (bs and bs or 1) + (skip_ptr and (skip_ptr[1] or 0) or 0) + + nbs = 
bs + if bs > max_len then return nil end + for i = 1, sub_len do + nbs = utf8next(str, nbs) + end + + return nbs, sub(str, bs, nbs - 1), bs + end +end + +local function utf8reverse (s) + local result = '' + for _, w in utf8gensub(s) do result = w .. result end + return result +end + +local function utf8validator(str, bs) + bs = bs or 1 + + if type(str) ~= "string" then + error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(str).. ")") + end + if type(bs) ~= "number" then + error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(bs).. ")") + end + + local c = byte(str, bs) + if not c then return end + + -- determine bytes needed for character, based on RFC 3629 + + -- UTF8-1 + if c >= 0 and c <= 127 then + return bs + 1 + elseif c >= 128 and c <= 193 then + return bs + 1, bs, 1, c + -- UTF8-2 + elseif c >= 194 and c <= 223 then + local c2 = byte(str, bs + 1) + if not c2 or c2 < 128 or c2 > 191 then + return bs + 2, bs, 2, c2 + end + + return bs + 2 + -- UTF8-3 + elseif c >= 224 and c <= 239 then + local c2 = byte(str, bs + 1) + + if not c2 then + return bs + 2, bs, 2, c2 + end + + -- validate byte 2 + if c == 224 and (c2 < 160 or c2 > 191) then + return bs + 2, bs, 2, c2 + elseif c == 237 and (c2 < 128 or c2 > 159) then + return bs + 2, bs, 2, c2 + elseif c2 < 128 or c2 > 191 then + return bs + 2, bs, 2, c2 + end + + local c3 = byte(str, bs + 2) + if not c3 or c3 < 128 or c3 > 191 then + return bs + 3, bs, 3, c3 + end + + return bs + 3 + -- UTF8-4 + elseif c >= 240 and c <= 244 then + local c2 = byte(str, bs + 1) + + if not c2 then + return bs + 2, bs, 2, c2 + end + + -- validate byte 2 + if c == 240 and (c2 < 144 or c2 > 191) then + return bs + 2, bs, 2, c2 + elseif c == 244 and (c2 < 128 or c2 > 143) then + return bs + 2, bs, 2, c2 + elseif c2 < 128 or c2 > 191 then + return bs + 2, bs, 2, c2 + end + + local c3 = byte(str, bs + 2) + if not c3 or c3 < 128 or c3 > 191 then + return bs + 3, bs, 3, c3 + end + + local c4 = byte(str, 
bs + 3) + if not c4 or c4 < 128 or c4 > 191 then + return bs + 4, bs, 4, c4 + end + + return bs + 4 + else -- c > 245 + return bs + 1, bs, 1, c + end +end + +local function utf8validate(str, byte_pos) + local result = {} + for nbs, bs, part, code in utf8validator, str, byte_pos do + if bs then + result[#result + 1] = { pos = bs, part = part, code = code } + end + end + return #result == 0, result +end + +local function utf8codes(str) + local max_len = #str + local bs = 1 + return function(skip_ptr) + if bs > max_len then return nil end + local pbs = bs + bs = utf8next(str, pbs) + + return pbs, utf8unicode(str, pbs, pbs), pbs + end +end + + +--[[-- +differs from Lua 5.3 utf8.offset in accepting any byte positions (not only head byte) for all n values + +h - head, c - continuation, t - tail +hhhccthccthccthcthhh + ^ start byte pos +searching current charracter head by moving backwards +hhhccthccthccthcthhh + ^ head + +n == 0: current position +n > 0: n jumps forward +n < 0: n more scans backwards +--]]-- +local function utf8offset(str, n, bs) + local l = #str + if not bs then + if n < 0 then + bs = l + 1 + else + bs = 1 + end + end + if bs < 0 or bs > l + 1 then + error("bad argument #3 to 'offset' (position out of range)") + end + + if n == 0 then + if bs == l + 1 then + return bs + end + while true do + local b = byte(str, bs) + if 0 < b and b < 127 + or 194 < b and b < 244 then + return bs + end + bs = bs - 1 + if bs < 1 then + return + end + end + elseif n < 0 then + bs = bs - 1 + repeat + if bs < 1 then + return + end + + local b = byte(str, bs) + if 0 < b and b < 127 + or 194 < b and b < 244 then + n = n + 1 + end + bs = bs - 1 + until n == 0 + return bs + 1 + else + while true do + if bs > l then + return + end + + local b = byte(str, bs) + if 0 < b and b < 127 + or 194 < b and b < 244 then + n = n - 1 + for i = 1, n do + if bs > l then + return + end + bs = utf8next(str, bs) + end + return bs + end + bs = bs - 1 + end + end + +end + +utf8.len = utf8len 
+utf8.sub = utf8sub +utf8.reverse = utf8reverse +utf8.char = utf8char +utf8.unicode = utf8unicode +utf8.byte = utf8byte +utf8.next = utf8next +utf8.gensub = utf8gensub +utf8.validator = utf8validator +utf8.validate = utf8validate +utf8.dump = dump +utf8.format = format +utf8.lower = lower +utf8.upper = upper +utf8.rep = rep +utf8.raw = {} +for k,v in pairs(string) do + utf8.raw[k] = v +end + +utf8.charpattern = '[\0-\127\194-\244][\128-\191]*' +utf8.offset = utf8offset +local ok, utf8_53 = pcall(require, "utf8") +if ok then + utf8.codes = utf8_53.codes + utf8.codepoint = utf8_53.codepoint + utf8.len53 = utf8_53.len +else + utf8.codes = utf8codes + utf8.codepoint = utf8unicode +end + +return utf8 + +end diff --git a/loveframes/third-party/utf8/primitives/init.lua b/loveframes/third-party/utf8/primitives/init.lua new file mode 100644 index 0000000..df28ef3 --- /dev/null +++ b/loveframes/third-party/utf8/primitives/init.lua @@ -0,0 +1,23 @@ +return function(utf8) + +local provided = utf8.config.primitives + +if provided then + if type(provided) == "table" then + return provided + elseif type(provided) == "function" then + return provided(utf8) + else + return utf8:require(provided) + end +end + +if pcall(require, "tarantool") then + return utf8:require "primitives.tarantool" +elseif pcall(require, "ffi") then + return utf8:require "primitives.native" +else + return utf8:require "primitives.dummy" +end + +end diff --git a/loveframes/third-party/utf8/primitives/native.lua b/loveframes/third-party/utf8/primitives/native.lua new file mode 100644 index 0000000..3653839 --- /dev/null +++ b/loveframes/third-party/utf8/primitives/native.lua @@ -0,0 +1,46 @@ +return function(utf8) + +os.setlocale(utf8.config.locale, "ctype") + +local ffi = require("ffi") +ffi.cdef[[ + int towupper(int c); + int towlower(int c); +]] + +utf8:require "primitives.dummy" + +function utf8.lower(str) + local bs = 1 + local nbs + local bytes = utf8.raw.len(str) + local res = {} + + while bs <= bytes 
do + nbs = utf8.next(str, bs) + local cp = utf8.unicode(str, bs, nbs) + res[#res + 1] = ffi.C.towlower(cp) + bs = nbs + end + + return utf8.char(utf8.config.unpack(res)) +end + +function utf8.upper(str) + local bs = 1 + local nbs + local bytes = utf8.raw.len(str) + local res = {} + + while bs <= bytes do + nbs = utf8.next(str, bs) + local cp = utf8.unicode(str, bs, nbs) + res[#res + 1] = ffi.C.towupper(cp) + bs = nbs + end + + return utf8.char(utf8.config.unpack(res)) +end + +return utf8 +end diff --git a/loveframes/third-party/utf8/primitives/tarantool.lua b/loveframes/third-party/utf8/primitives/tarantool.lua new file mode 100644 index 0000000..c38acf6 --- /dev/null +++ b/loveframes/third-party/utf8/primitives/tarantool.lua @@ -0,0 +1,13 @@ +return function(utf8) + +utf8:require "primitives.dummy" + +local tnt_utf8 = utf8.config.tarantool_utf8 or require("utf8") + +utf8.lower = tnt_utf8.lower +utf8.upper = tnt_utf8.upper +utf8.len = tnt_utf8.len +utf8.char = tnt_utf8.char + +return utf8 +end diff --git a/loveframes/third-party/utf8/regex_parser.lua b/loveframes/third-party/utf8/regex_parser.lua new file mode 100644 index 0000000..0d3b4fe --- /dev/null +++ b/loveframes/third-party/utf8/regex_parser.lua @@ -0,0 +1,78 @@ +return function(utf8) + +utf8:require "modifier.compiletime.parser" +utf8:require "charclass.compiletime.parser" +utf8:require "begins.compiletime.parser" +utf8:require "ends.compiletime.parser" + +local gensub = utf8.gensub +local sub = utf8.sub + +local parser_context = utf8:require "context.compiletime" + +return function(regex, plain) + utf8.debug("regex", regex) + local ctx = parser_context:new() + + local skip = {0} + for nbs, c, bs in gensub(regex, 0), skip do + repeat -- continue + skip[1] = 0 + + c = utf8.raw.sub(regex, bs, utf8.next(regex, bs) - 1) + + local functions, move = utf8.regex.compiletime.begins.parse(regex, c, bs, ctx) + if functions then + ctx.begins = functions + skip[1] = move + end + if skip[1] ~= 0 then break end + + local 
functions, move = utf8.regex.compiletime.ends.parse(regex, c, bs, ctx) + if functions then + ctx.ends = functions + skip[1] = move + end + if skip[1] ~= 0 then break end + + local functions, move = utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx) + if functions then + for _, f in ipairs(functions) do + ctx.funcs[#ctx.funcs + 1] = f + end + skip[1] = move + end + if skip[1] ~= 0 then break end + + local charclass, move = utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx) + if charclass then skip[1] = move end + until true -- continue + end + + for _, m in ipairs(utf8.config.modifier) do + if m.check then m.check(ctx) end + end + + local src = [[ + return function(str, init, utf8) + local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1}) + local cl = utf8:require("charclass.runtime.init") + local utf8sub = utf8.sub + local utf8len = utf8.len + local debug = utf8.debug + local function add(fun) + ctx.functions[#ctx.functions + 1] = fun + end + ]] .. ctx.begins + for _, v in ipairs(ctx.funcs) do src = src .. v end + src = src .. ctx.ends .. [[ + return coroutine.wrap(ctx:get_function())(ctx) + end + ]] + + utf8.debug(regex, src) + + return assert(utf8.config.loadstring(src, (plain and "plain " or "") .. regex))() +end + +end diff --git a/loveframes/third-party/utf8/util.lua b/loveframes/third-party/utf8/util.lua new file mode 100644 index 0000000..7723626 --- /dev/null +++ b/loveframes/third-party/utf8/util.lua @@ -0,0 +1,64 @@ +return function(utf8) + +function utf8.util.copy(obj, deep) + if type(obj) == 'table' then + local result = {} + if deep then + for k,v in pairs(obj) do + result[k] = utf8.util.copy(v, true) + end + else + for k,v in pairs(obj) do + result[k] = v + end + end + return result + else + return obj + end +end + +local function dump(val, tab) + tab = tab or '' + + if type(val) == 'table' then + utf8.config.logger('{\n') + for k,v in pairs(val) do + utf8.config.logger(tab .. tostring(k) .. " = ") + dump(v, tab .. 
'\t') + utf8.config.logger("\n") + end + utf8.config.logger(tab .. '}\n') + else + utf8.config.logger(tostring(val)) + end +end + +function utf8.util.debug(...) + local t = {...} + for _, v in ipairs(t) do + if type(v) == "table" and not (getmetatable(v) or {}).__tostring then + dump(v, '\t') + else + utf8.config.logger(tostring(v), " ") + end + end + + utf8.config.logger('\n') +end + +function utf8.debug(...) + if utf8.config.debug then + utf8.config.debug(...) + end +end + +function utf8.util.next(str, bs) + local nbs1 = utf8.next(str, bs) + local nbs2 = utf8.next(str, nbs1) + return utf8.raw.sub(str, nbs1, nbs2 - 1), nbs1 +end + +return utf8.util + +end