diff --git a/htmlparser.lua b/htmlparser.lua new file mode 100644 index 0000000..65991f4 --- /dev/null +++ b/htmlparser.lua @@ -0,0 +1,272 @@ +-- vim: ft=lua ts=2 sw=2 + +-- Syntactic Sugar {{{ +local function rine(val) -- Return (val) If it's Not Empty (non-zero-length) + return (val and #val>0) and val +end +local function rit(a) -- Return (a) If it's Table + return (type(a) == "table") and a +end +local noop = function() end +local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end +local str = tostring +local char = string.char +local opts = rit(htmlparser_opts) or {} -- needed for silent/noerr/noout/nonl directives, also needed to be defined before `require` in such case +local prn = opts.silent and noop or function(l,f,...) + local fd = (l=="i") and "stdout" or "stderr" + local t = (" [%s] "):format(l:upper()) + io[fd] + :write('[HTMLParser]'..t..f:format(...) + ..(opts.nonl or "\n") + ) +end +local err = opts.noerr and noop or function(f,...) prn("e",f,...) end +local out = opts.noout and noop or function(f,...) prn("i",f,...) end +local line = debug and function(lvl) return debug.getinfo(lvl or 2).currentline end or noop +local dbg = opts.debug and function(f,...) prn("d",f:gsub("#LINE#",str(line(3))),...) end or noop +-- }}} +-- Requires {{{ + +-- MODIFIED -- + +-- local ElementNode = require"htmlparser.ElementNode" +-- local voidelements = require"htmlparser.voidelements" +local success, utility = pcall(function() + return dofile(arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") .. "utility-functions.lua") +end) +if not success then + print("\n\n" .. tostring(utility)) + error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n") +end + +local ElementNode = utility.require("htmlparser/ElementNode") +local voidelements = utility.require("htmlparser/voidelements") + +-- MODIFIED -- + +--}}} +local HtmlParser = {} +local function parse(text,limit) -- {{{ + local opts = rine(opts) -- use top-level opts-table (the one, defined before requiring the module), if exists + or rit(htmlparser_opts) -- or defined after requiring (but before calling `parse`) + or {} -- fallback otherwise + opts.looplimit = opts.looplimit or htmlparser_looplimit + + local text = str(text) + local limit = limit or opts.looplimit or 1000 + local tpl = false + + if not opts.keep_comments then -- Strip (or not) comments {{{ + text = text:gsub("","") -- Many chances commented code will have syntax errors, that'll lead to parser failures + end -- }}} + + local tpr={} + + if not opts.keep_danger_placeholders then -- {{{ little speedup by cost of potential parsing breakages + -- search unused "invalid" bytes {{{ + local busy,i={},0; + repeat -- {{{ + local cc = char(i) + if not(text:match(cc)) then -- {{{ + if not(tpr["<"]) or not(tpr[">"]) then -- {{{ + if not(busy[i]) then -- {{{ + if not(tpr["<"]) then -- {{{ + tpr["<"] = cc; + elseif not(tpr[">"]) then + tpr[">"] = cc; + end -- }}} + busy[i] = true + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",str(c),cc:byte(),str(tpr[c])) + dbg("busy[i]:{%s},i:{%d}",str(busy[i]),i) + dbg("[FindPH]:#LINE# Success! || i=%d",i) + else -- if !busy + dbg("[FindPH]:#LINE# Busy! || i=%d",i) + end -- if !busy -- }}} + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",c,cc:byte(),str(tpr[c])) + dbg("%s",str(busy[i])) + else -- if < or > + dbg("[FindPH]:#LINE# Done!",i) + break + end -- if < or > -- }}} + else -- text!match(cc) + dbg("[FindPH]:#LINE# Text contains this byte! || i=%d",i) + end -- text!match(cc) -- }}} + local skip=1 + if i==31 then + skip=96 -- ASCII + end + i=i+skip + until (i==255) -- }}} + i=nil + --- }}} + + if not(tpr["<"]) or not(tpr[">"]) then + err("Impossible to find at least two unused byte codes in this HTML-code. We need it to escape bracket-contained placeholders inside tags.") + err("Consider enabling 'keep_danger_placeholders' option (to silence this error, if parser wasn't failed with current HTML-code) or manually replace few random bytes, to free up the codes.") + else + dbg("[FindPH]:#LINE# Found! || '<'=%d, '>'=%d",tpr["<"]:byte(),tpr[">"]:byte()) + end + +-- dbg("tpr[>] || tpr[] || #busy%d") + + -- g {{{ + local function g(id,...) + local arg={...} + local orig=arg[id] + arg[id]=arg[id]:gsub("(.)",tpr) + if arg[id] ~= orig then + tpl=true + dbg("[g]:#LINE# orig: %s", str(orig)) + dbg("[g]:#LINE# replaced: %s",str(arg[id])) + end + dbg("[g]:#LINE# called, id: %s, arg[id]: %s, args { "..(("{%s}, "):rep(#arg):gsub(", $","")).." }",id,arg[id],...) + dbg("[g]:#LINE# concat(arg): %s",table.concat(arg)) + return table.concat(arg) + end + -- g }}} + + -- tpl-placeholders and attributes {{{ + text=text + :gsub( + "(=[%s]-)".. -- only match attr.values, and not random strings between two random apostrophs + "(%b'')", + function(...)return g(2,...)end + ) + :gsub( + "(=[%s]-)".. -- same for " + '(%b"")', + function(...)return g(2,...)end + ) -- Escape "<"/">" inside attr.values (see issue #50) + :gsub( + "(<".. -- Match "<", + (opts.tpl_skip_pattern or "[^!]").. -- with exclusion pattern (for example, to ignore comments, which aren't template placeholders, but can legally contain "<"/">" inside. + ")([^>]+)".. -- If matched, we want to escape '<'s if we meet them inside tag + "(>)", + function(...)return g(2,...)end + ) + :gsub( + "(".. + (tpr["<"] or "__FAILED__").. -- Here we search for "<", we escaped in previous gsub (and don't break things if we have no escaping replacement) + ")(".. + (opts.tpl_marker_pattern or "[^%w%s]").. -- Capture templating symbol + ")([%g%s]-)".. -- match placeholder's content + "(%2)(>)".. -- placeholder's tail + "([^>]*>)", -- remainings + function(...)return g(5,...)end + ) + -- }}} + end -- }}} + + local index = 0 + local root = ElementNode:new(index, str(text)) + local node, descend, tpos, opentags = root, true, 1, {} + + while true do -- MainLoop {{{ + if index == limit then -- {{{ + err("Main loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end -- }}} + -- openstart/tpos Definitions {{{ + local openstart, name + openstart, tpos, name = root._text:find( + "<" .. -- an uncaptured starting "<" + "([%w-]+)" .. -- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) + dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name)) + -- }}} + if not name then break end + -- Some more vars {{{ + index = index + 1 + local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) + node = tag + local tagloop + local tagst, apos = tag:gettext(), 1 + -- }}} + while true do -- TagLoop {{{ + dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop)) + if tagloop == limit then -- {{{ + err("Tag parsing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end -- }}} + -- Attrs {{{ + local start, k, eq, quote, v, zsp + start, apos, k, zsp, eq, zsp, quote = tagst:find( + "%s+" .. -- some uncaptured space + "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" + "([%s]-)".. -- zero or more spaces + "(=?)" .. -- eq = the optional; "=", else "" + "([%s]-)".. -- zero or more spaces + [=[(['"]?)]=], -- quote = an optional "'" or '"' following the "=", or "" + apos) + dbg("[TagLoop]:#LINE# start=%s || apos=%s || k=%s || zsp='%s' || eq='%s', quote=[%s]",str(start),str(apos),str(k),str(zsp),str(eq),str(quote)) + -- }}} + if not k or k == "/>" or k == ">" then break end + -- Pattern {{{ + if eq == "=" then + local pattern = "=([^%s>]*)" + if quote ~= "" then + pattern = quote .. "([^" .. quote .. "]*)" .. quote + end + start, apos, v = tagst:find(pattern, apos) + dbg("[TagLoop]:#LINE# start=%s || apos=%s || v=%s || pattern=%s",str(start),str(apos),str(v),str(pattern)) + end + -- }}} + v=v or "" + if tpl then -- {{{ + for rk,rv in pairs(tpr) do + v = v:gsub(rv,rk) + dbg("[TagLoop]:#LINE# rv=%s || rk=%s",str(rv),str(rk)) + end + end -- }}} + + dbg("[TagLoop]:#LINE# k=%s || v=%s",str(k),str(v)) + tag:addattribute(k, v) + tagloop = (tagloop or 0) + 1 + end + -- }}} + if voidelements[tag.name:lower()] then -- {{{ + descend = false + tag:close() + else + descend = true + opentags[tag.name] = opentags[tag.name] or {} + table.insert(opentags[tag.name], tag) + end + -- }}} + local closeend = tpos + local closingloop + while true do -- TagCloseLoop {{{ + -- Can't remember why did I add that, so comment it for now (and not remove), in case it will be needed again + -- (although, it causes #59 and #60, so it will anyway be needed to rework) + -- if voidelements[tag.name:lower()] then break end -- already closed + if closingloop == limit then + err("Tag closing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end + + local closestart, closing, closename + closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename)) + + if not closing or closing == "" then break end + + tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags + closestart = root._text:find("<", closestart) + dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) + tag:close(closestart, closeend + 1) + node = tag.parent + descend = true + closingloop = (closingloop or 0) + 1 + end -- }}} + end -- }}} + if tpl then -- {{{ + dbg("tpl") + for k,v in pairs(tpr) do + root._text = root._text:gsub(v,k) + end + end -- }}} + return root +end -- }}} +HtmlParser.parse = parse +return HtmlParser diff --git a/htmlparser/ElementNode.lua b/htmlparser/ElementNode.lua new file mode 100644 index 0000000..0c39901 --- /dev/null +++ b/htmlparser/ElementNode.lua @@ -0,0 +1,283 @@ +-- vim: ft=lua ts=2 +local Set = {} +Set.mt = {__index = Set} +function Set:new(values) + local instance = {} + local isSet if getmetatable(values) == Set.mt then isSet = true end + if type(values) == "table" then + if not isSet and #values > 0 then + for _,v in ipairs(values) do + instance[v] = true + end + else + for k in pairs(values) do + instance[k] = true + end + end + elseif values ~= nil then + instance = {[values] = true} + end + return setmetatable(instance, Set.mt) +end + +function Set:add(e) + if e ~= nil then self[e] = true end + return self +end + +function Set:remove(e) + if e ~= nil then self[e] = nil end + return self +end + +function Set:tolist() + local res = {} + for k in pairs(self) do + table.insert(res, k) + end + return res +end + +Set.mt.__add = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = true end + return res +end + +-- Subtraction +Set.mt.__sub = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = nil end + return res +end + +-- Intersection +Set.mt.__mul = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do + res[k] = b[k] + end + return res +end + +-- String representation +Set.mt.__tostring = function (set) + local s = "{" + local sep = "" + for k in pairs(set) do + s = s .. sep .. tostring(k) + sep = ", " + end + return s .. "}" +end + + +local ElementNode = {} +ElementNode.mt = {__index = ElementNode} +function ElementNode:new(index, nameortext, node, descend, openstart, openend) + local instance = { + index = index, + name = nameortext, + level = 0, + parent = nil, + root = nil, + nodes = {}, + _openstart = openstart, _openend = openend, + _closestart = openstart, _closeend = openend, + attributes = {}, + id = nil, + classes = {}, + deepernodes = Set:new(), + deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} + } + if not node then + instance.name = "root" + instance.root = instance + instance._text = nameortext + local length = string.len(nameortext) + instance._openstart, instance._openend = 1, length + instance._closestart, instance._closeend = 1, length + elseif descend then + instance.root = node.root + instance.parent = node + instance.level = node.level + 1 + table.insert(node.nodes, instance) + else + instance.root = node.root + instance.parent = node.parent or node --XXX: adds some safety but needs more testing for heisenbugs in corner cases + instance.level = node.level + table.insert((node.parent and node.parent.nodes or node.nodes), instance) --XXX: see above about heisenbugs + end + return setmetatable(instance, ElementNode.mt) +end + +function ElementNode:gettext() + return string.sub(self.root._text, self._openstart, self._closeend) +end + +function ElementNode:settext(c) + self.root._text=c +end + +function ElementNode:textonly() + return (self:gettext():gsub("<[^>]*>","")) +end + +function ElementNode:getcontent() + return string.sub(self.root._text, self._openend + 1, self._closestart - 1) +end + +function ElementNode:addattribute(k, v) + self.attributes[k] = v + if string.lower(k) == "id" then + self.id = v + -- class attribute contains "space-separated tokens", each of which we'd like quick access to + elseif string.lower(k) == "class" then + for class in string.gmatch(v, "%S+") do + table.insert(self.classes, class) + end + end +end + +local function insert(table, name, node) + table[name] = table[name] or Set:new() + table[name]:add(node) +end + +function ElementNode:close(closestart, closeend) + if closestart and closeend then + self._closestart, self._closeend = closestart, closeend + end + -- inform hihger level nodes about this element's existence in their branches + local node = self + while true do + node = node.parent + if not node then break end + node.deepernodes:add(self) + insert(node.deeperelements, self.name, self) + for k in pairs(self.attributes) do + insert(node.deeperattributes, k, self) + end + if self.id then + insert(node.deeperids, self.id, self) + end + for _,v in ipairs(self.classes) do + insert(node.deeperclasses, v, self) + end + end +end + +local function escape(s) + -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix + return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") +end + +local function select(self, s) + if not s or type(s) ~= "string" or s == "" then return Set:new() end + local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes, + ["#"] = self.deeperids, ["."] = self.deeperclasses} + local function match(t, w) + local m, e, v + if t == "[" then w, m, e, v = string.match(w, + "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^" + "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "=" + "(=?)" .. -- e = the optional "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = Set:new(sets[t][w]) + -- attribute value selectors + if e == "=" then + if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted + v = string.sub(v, 2, #v - 1) -- strip quotes + if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute + for node in pairs(matched) do + local a = node.attributes[w] + -- equals + if m == "" and a ~= v then matched:remove(node) + -- not equals + elseif m == "!" and a == v then matched:remove(node) + -- prefix + elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node) + -- contains + elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node) + -- word + elseif m =="~" then matched:remove(node) + for word in string.gmatch(a, "%S+") do + if word == v then matched:add(node) break end + end + -- starts with + elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node) + -- ends with + elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node) + end + end -- for node + end -- if v + return matched + end + + local subjects, resultset, childrenonly = Set:new({self}) + for part in string.gmatch(s, "%S+") do + repeat + if part == ">" then childrenonly = true --[[goto nextpart]] break end + resultset = Set:new() + for subject in pairs(subjects) do + local star = subject.deepernodes + if childrenonly then star = Set:new(subject.nodes) end + resultset = resultset + star + end + childrenonly = false + if part == "*" then --[[goto nextpart]] break end + local excludes, filter = Set:new() + local start, pos = 0, 0 + while true do + local switch, stype, name, eq, quote + start, pos, switch, stype, name, eq, quote = string.find(part, + "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off + "([:%[#.]?)" .. -- stype = a possible :, [, #, or . + "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore) + "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = + "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value + pos + 1 + ) + if not name then break end + repeat + if ":" == stype then + filter = name + --[[goto nextname]] break + end + if ")" == switch then + filter = nil + end + if "[" == stype and "" ~= quote then + local value + start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) + name = name .. eq .. value + end + local matched = match(stype, name) + if filter == "not" then + excludes = excludes + matched + else + resultset = resultset * matched + end + --::nextname:: + break + until true + end + resultset = resultset - excludes + subjects = Set:new(resultset) + --::nextpart:: +break +until true + end + resultset = resultset:tolist() + table.sort(resultset, function (a, b) return a.index < b.index end) + return resultset +end + +function ElementNode:select(s) return select(self, s) end +ElementNode.mt.__call = select + +return ElementNode diff --git a/htmlparser/voidelements.lua b/htmlparser/voidelements.lua new file mode 100644 index 0000000..43dedf5 --- /dev/null +++ b/htmlparser/voidelements.lua @@ -0,0 +1,19 @@ +-- vim: ft=lua ts=2 +return { + area = true, + base = true, + br = true, + col = true, + command = true, + embed = true, + hr = true, + img = true, + input = true, + keygen = true, + link = true, + meta = true, + param = true, + source = true, + track = true, + wbr = true +} diff --git a/json.lua b/json.lua new file mode 100644 index 0000000..711ef78 --- /dev/null +++ b/json.lua @@ -0,0 +1,388 @@ +-- +-- json.lua +-- +-- Copyright (c) 2020 rxi +-- +-- Permission is hereby granted, free of charge, to any person obtaining a copy of +-- this software and associated documentation files (the "Software"), to deal in +-- the Software without restriction, including without limitation the rights to +-- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +-- of the Software, and to permit persons to whom the Software is furnished to do +-- so, subject to the following conditions: +-- +-- The above copyright notice and this permission notice shall be included in all +-- copies or substantial portions of the Software. +-- +-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +-- SOFTWARE. +-- + +local json = { _version = "0.1.2" } + +------------------------------------------------------------------------------- +-- Encode +------------------------------------------------------------------------------- + +local encode + +local escape_char_map = { + [ "\\" ] = "\\", + [ "\"" ] = "\"", + [ "\b" ] = "b", + [ "\f" ] = "f", + [ "\n" ] = "n", + [ "\r" ] = "r", + [ "\t" ] = "t", +} + +local escape_char_map_inv = { [ "/" ] = "/" } +for k, v in pairs(escape_char_map) do + escape_char_map_inv[v] = k +end + + +local function escape_char(c) + return "\\" .. (escape_char_map[c] or string.format("u%04x", c:byte())) +end + + +local function encode_nil(val) + return "null" +end + + +local function encode_table(val, stack) + local res = {} + stack = stack or {} + + -- Circular reference? + if stack[val] then error("circular reference") end + + stack[val] = true + + if rawget(val, 1) ~= nil or next(val) == nil then + -- Treat as array -- check keys are valid and it is not sparse + local n = 0 + for k in pairs(val) do + if type(k) ~= "number" then + error("invalid table: mixed or invalid key types") + end + n = n + 1 + end + if n ~= #val then + error("invalid table: sparse array") + end + -- Encode + for i, v in ipairs(val) do + table.insert(res, encode(v, stack)) + end + stack[val] = nil + return "[" .. table.concat(res, ",") .. "]" + + else + -- Treat as an object + for k, v in pairs(val) do + if type(k) ~= "string" then + error("invalid table: mixed or invalid key types") + end + table.insert(res, encode(k, stack) .. ":" .. encode(v, stack)) + end + stack[val] = nil + return "{" .. table.concat(res, ",") .. "}" + end +end + + +local function encode_string(val) + return '"' .. val:gsub('[%z\1-\31\\"]', escape_char) .. '"' +end + + +local function encode_number(val) + -- Check for NaN, -inf and inf + if val ~= val or val <= -math.huge or val >= math.huge then + error("unexpected number value '" .. tostring(val) .. "'") + end + return string.format("%.14g", val) +end + + +local type_func_map = { + [ "nil" ] = encode_nil, + [ "table" ] = encode_table, + [ "string" ] = encode_string, + [ "number" ] = encode_number, + [ "boolean" ] = tostring, +} + + +encode = function(val, stack) + local t = type(val) + local f = type_func_map[t] + if f then + return f(val, stack) + end + error("unexpected type '" .. t .. "'") +end + + +function json.encode(val) + return ( encode(val) ) +end + + +------------------------------------------------------------------------------- +-- Decode +------------------------------------------------------------------------------- + +local parse + +local function create_set(...) + local res = {} + for i = 1, select("#", ...) do + res[ select(i, ...) ] = true + end + return res +end + +local space_chars = create_set(" ", "\t", "\r", "\n") +local delim_chars = create_set(" ", "\t", "\r", "\n", "]", "}", ",") +local escape_chars = create_set("\\", "/", '"', "b", "f", "n", "r", "t", "u") +local literals = create_set("true", "false", "null") + +local literal_map = { + [ "true" ] = true, + [ "false" ] = false, + [ "null" ] = nil, +} + + +local function next_char(str, idx, set, negate) + for i = idx, #str do + if set[str:sub(i, i)] ~= negate then + return i + end + end + return #str + 1 +end + + +local function decode_error(str, idx, msg) + local line_count = 1 + local col_count = 1 + for i = 1, idx - 1 do + col_count = col_count + 1 + if str:sub(i, i) == "\n" then + line_count = line_count + 1 + col_count = 1 + end + end + error( string.format("%s at line %d col %d", msg, line_count, col_count) ) +end + + +local function codepoint_to_utf8(n) + -- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa + local f = math.floor + if n <= 0x7f then + return string.char(n) + elseif n <= 0x7ff then + return string.char(f(n / 64) + 192, n % 64 + 128) + elseif n <= 0xffff then + return string.char(f(n / 4096) + 224, f(n % 4096 / 64) + 128, n % 64 + 128) + elseif n <= 0x10ffff then + return string.char(f(n / 262144) + 240, f(n % 262144 / 4096) + 128, + f(n % 4096 / 64) + 128, n % 64 + 128) + end + error( string.format("invalid unicode codepoint '%x'", n) ) +end + + +local function parse_unicode_escape(s) + local n1 = tonumber( s:sub(1, 4), 16 ) + local n2 = tonumber( s:sub(7, 10), 16 ) + -- Surrogate pair? + if n2 then + return codepoint_to_utf8((n1 - 0xd800) * 0x400 + (n2 - 0xdc00) + 0x10000) + else + return codepoint_to_utf8(n1) + end +end + + +local function parse_string(str, i) + local res = "" + local j = i + 1 + local k = j + + while j <= #str do + local x = str:byte(j) + + if x < 32 then + decode_error(str, j, "control character in string") + + elseif x == 92 then -- `\`: Escape + res = res .. str:sub(k, j - 1) + j = j + 1 + local c = str:sub(j, j) + if c == "u" then + local hex = str:match("^[dD][89aAbB]%x%x\\u%x%x%x%x", j + 1) + or str:match("^%x%x%x%x", j + 1) + or decode_error(str, j - 1, "invalid unicode escape in string") + res = res .. parse_unicode_escape(hex) + j = j + #hex + else + if not escape_chars[c] then + decode_error(str, j - 1, "invalid escape char '" .. c .. "' in string") + end + res = res .. escape_char_map_inv[c] + end + k = j + 1 + + elseif x == 34 then -- `"`: End of string + res = res .. str:sub(k, j - 1) + return res, j + 1 + end + + j = j + 1 + end + + decode_error(str, i, "expected closing quote for string") +end + + +local function parse_number(str, i) + local x = next_char(str, i, delim_chars) + local s = str:sub(i, x - 1) + local n = tonumber(s) + if not n then + decode_error(str, i, "invalid number '" .. s .. "'") + end + return n, x +end + + +local function parse_literal(str, i) + local x = next_char(str, i, delim_chars) + local word = str:sub(i, x - 1) + if not literals[word] then + decode_error(str, i, "invalid literal '" .. word .. "'") + end + return literal_map[word], x +end + + +local function parse_array(str, i) + local res = {} + local n = 1 + i = i + 1 + while 1 do + local x + i = next_char(str, i, space_chars, true) + -- Empty / end of array? + if str:sub(i, i) == "]" then + i = i + 1 + break + end + -- Read token + x, i = parse(str, i) + res[n] = x + n = n + 1 + -- Next token + i = next_char(str, i, space_chars, true) + local chr = str:sub(i, i) + i = i + 1 + if chr == "]" then break end + if chr ~= "," then decode_error(str, i, "expected ']' or ','") end + end + return res, i +end + + +local function parse_object(str, i) + local res = {} + i = i + 1 + while 1 do + local key, val + i = next_char(str, i, space_chars, true) + -- Empty / end of object? + if str:sub(i, i) == "}" then + i = i + 1 + break + end + -- Read key + if str:sub(i, i) ~= '"' then + decode_error(str, i, "expected string for key") + end + key, i = parse(str, i) + -- Read ':' delimiter + i = next_char(str, i, space_chars, true) + if str:sub(i, i) ~= ":" then + decode_error(str, i, "expected ':' after key") + end + i = next_char(str, i + 1, space_chars, true) + -- Read value + val, i = parse(str, i) + -- Set + res[key] = val + -- Next token + i = next_char(str, i, space_chars, true) + local chr = str:sub(i, i) + i = i + 1 + if chr == "}" then break end + if chr ~= "," then decode_error(str, i, "expected '}' or ','") end + end + return res, i +end + + +local char_func_map = { + [ '"' ] = parse_string, + [ "0" ] = parse_number, + [ "1" ] = parse_number, + [ "2" ] = parse_number, + [ "3" ] = parse_number, + [ "4" ] = parse_number, + [ "5" ] = parse_number, + [ "6" ] = parse_number, + [ "7" ] = parse_number, + [ "8" ] = parse_number, + [ "9" ] = parse_number, + [ "-" ] = parse_number, + [ "t" ] = parse_literal, + [ "f" ] = parse_literal, + [ "n" ] = parse_literal, + [ "[" ] = parse_array, + [ "{" ] = parse_object, +} + + +parse = function(str, idx) + local chr = str:sub(idx, idx) + local f = char_func_map[chr] + if f then + return f(str, idx) + end + decode_error(str, idx, "unexpected character '" .. chr .. "'") +end + + +function json.decode(str) + if type(str) ~= "string" then + error("expected argument of type string, got " .. type(str)) + end + local res, idx = parse(str, next_char(str, 1, space_chars, true)) + idx = next_char(str, idx, space_chars, true) + if idx <= #str then + decode_error(str, idx, "trailing garbage") + end + return res +end + + +return json diff --git a/make-epub.lua b/make-epub.lua index a4d4956..00df69a 100644 --- a/make-epub.lua +++ b/make-epub.lua @@ -32,20 +32,6 @@ Configuration example: } ]] -local function checkreq(name, display) - local success, library = pcall(function() return require(name) end) - if not success then - error("'" .. (display or name) .. "' missing or failed to load.") - else - return library - end -end - --- local htmlparser = checkreq("htmlparser") -- so that this can be used for its non-HTML functions without htmlparser installed, this check is delayed --- local json = checkreq('json', 'dkjson') -- I replaced the default, currently working on making sure it's properly included here.. -local json = require("json") -- TODO replace with utility-function loader --- pandoc and curl are also required - local success, utility = pcall(function() return dofile(arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") .. "utility-functions.lua") end) @@ -54,13 +40,14 @@ if not success then error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n") end -local path_separator = "\\" -- temporarily hard-forcing Windows because it's being SUCH A PILE OF GARBAGE --- local path_separator --- if utility.OS == "Windows" then --- path_separator = "\\" --- else --- path_separator = "/" --- end +local json = utility.require("json") + +local path_separator +if utility.OS == "Windows" then + path_separator = "\\" +else + path_separator = "/" +end -- also checks for errors -- TODO make it check for required elements and error if any are missing! @@ -69,7 +56,7 @@ local function get_config() print(help) error("\nA config file name/path must be specified.") elseif arg[1] == "-h" or arg[1] == "--help" then - error(help) + error(help) -- I strongly dislike using an error to print a help message instead of gracefully exiting.. end local file, err = io.open(arg[1], "r") @@ -109,9 +96,7 @@ local function format_metadata(config) end local function download_pages(config) - -- no longer necessary because utility initializes it for us - -- math.randomseed(os.time()) -- for randomized temporary file name and timings to avoid rate limiting - local htmlparser = require "htmlparser" -- TODO replace with local load + local htmlparser = utility.require("htmlparser") utility.required_program("curl") os.execute("mkdir Sections") diff --git a/utility-functions.lua b/utility-functions.lua index 4da573b..c2e0254 100644 --- a/utility-functions.lua +++ b/utility-functions.lua @@ -1,3 +1,15 @@ +-- TO USE, PUT THE INTERIOR OF THIS FUNCTION IN YOUR FILE +-- this only works if that file is in the same directory as this one - but works no matter where it was called from +local function _example_load() + local success, utility = pcall(function() + return dofile(arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") .. "utility-functions.lua") + end) + if not success then + print("\n\n" .. tostring(utility)) + error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n") + end +end + math.randomseed(os.time()) local utility = {} @@ -8,7 +20,7 @@ else utility.OS = "UNIX-like" end -utility.path = arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") -- related to discussion in https://stackoverflow.com/q/6380820 +utility.path = arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") -- inspired by discussion in https://stackoverflow.com/q/6380820 -- always uses outputting to a temporary file to guarantee safety function os.capture_safe(command, tmp_file_name) @@ -39,6 +51,19 @@ function string.trim(s) return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)' end +utility.require = function(name) + local success, package_or_err = pcall(function() + return dofile(arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") .. name .. ".lua") + end) + if success then + return package_or_err + else + print("\n\n" .. tostring(package_or_err)) + error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n") + end +end + +-- errors if specified program isn't in the path utility.required_program = function(name) local command if utility.OS == "Windows" then @@ -74,6 +99,7 @@ utility.escape_quotes = function(input) return input end +-- Example, print all items in this directory: utility.ls(".")(print) utility.ls = function(path) local command if utility.OS == "Windows" then