From b1ce66fecb8f61315dea4f1d99eb6beee6710a7c Mon Sep 17 00:00:00 2001 From: "Vadim A. Misbakh-Soloviov" Date: Sun, 9 Jun 2019 04:20:53 +0300 Subject: [PATCH] Rework a bit: - Config API - Debugging - Fixes #50, #42 and similar - Reformatting Signed-off-by: Vadim A. Misbakh-Soloviov --- src/htmlparser.lua | 270 ++++++++++++++++++++++----------- src/htmlparser/ElementNode.lua | 4 +- 2 files changed, 183 insertions(+), 91 deletions(-) diff --git a/src/htmlparser.lua b/src/htmlparser.lua index c029eb5..c43cd6b 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,163 +1,255 @@ -- vim: ft=lua ts=2 sw=2 +-- Syntactic Sugar {{{ +local function rine(val) -- Return (val) If it's Not Empty (non-zero-length) + return (val and #val>0) and val +end +local function rit(a) -- Return (a) If it's Table + return (type(a) == "table") and a +end +local noop = function() end local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end local str = tostring local char = string.char -local err = function(s) io.stderr:write(s) end -local out = function(s) io.stdout:write(s) end - -local ElementNode = require("htmlparser.ElementNode") -local voidelements = require("htmlparser.voidelements") - +local opts = rit(htmlparser_opts) or {} -- needed for silent/noerr/noout/nonl directives, also needed to be defined before `require` in such case +local prn = opts.silent and noop or function(l,f,...) + local fd = (l=="i") and "stdout" or "stderr" + local t = (" [%s] "):format(l:upper()) + io[fd] + :write('[HTMLParser]'..t..f:format(...) + ..(opts.nonl or "\n") + ) +end +local err = opts.noerr and noop or function(f,...) prn("e",f,...) end +local out = opts.noout and noop or function(f,...) prn("i",f,...) end +local line = debug and function(lvl) return debug.getinfo(lvl or 2).currentline end or noop +local dbg = opts.debug and function(f,...) prn("d",f:gsub("#LINE#",str(line(3))),...) end or noop +-- }}} +-- Requires {{{ +local ElementNode = require"htmlparser.ElementNode" +local voidelements = require"htmlparser.voidelements" +--}}} local HtmlParser = {} +local function parse(text,limit) -- {{{ + local opts = rine(opts) -- use top-level opts-table (the one, defined before requiring the module), if exists + or rit(htmlparser_opts) -- or defined after requiring (but before calling `parse`) + or {} -- fallback otherwise + opts.looplimit = opts.looplimit or htmlparser_looplimit -local tpr = { - -- Here we're replacing confusing sequences - -- (things looking like tags, but appearing where tags can't) - -- with definitelly invalid utf sequence, and later we'll replace them back - ["<"] = char(208,209,208,209), - [">"] = char(209,208,209,208), -} - -local function parse(text,limit) - local text=str(text) - - local limit = limit or htmlparser_looplimit or 1000 - + local text = str(text) + local limit = limit or opts.looplimit or 1000 local tpl = false - local function g(id,...) - local arg={...} - arg[id]=tpr[arg[id]] - tpl=true - return table.concat(arg) - end + if not opts.keep_comments then -- Strip (or not) comments {{{ + text = text:gsub("","") -- Many chances commented code will have syntax errors, that'll lead to parser failures + end -- }}} - text = text - :gsub( - "(<)".. - "([^>]-)".. - "(<)", - function(...)return g(3,...)end - ):gsub( - "("..tpr["<"]..")".. - "([^%w%s])".. - "([^%2]-)".. - "(%2)".. - "(>)".. - "([^>]-)".. - "(>)", - function(...)return g(5,...)end - ):gsub( - [=[(['"])]=].. - [=[([^'">%s]-)]=].. - "(>)".. - [=[([^'">%s]-)]=].. - [=[(['"])]=], - function(...)return g(3,...)end - ) + local tpr={} + + if not opts.keep_danger_placeholders then -- {{{ little speedup by cost of potential parsing breakages + -- search unused "invalid" bytes {{{ + local busy,i={},0; + repeat -- {{{ + local cc = char(i) + if not(text:match(cc)) then -- {{{ + if not(tpr["<"]) or not(tpr[">"]) then -- {{{ + if not(busy[i]) then -- {{{ + if not(tpr["<"]) then -- {{{ + tpr["<"] = cc; + elseif not(tpr[">"]) then + tpr[">"] = cc; + end -- }}} + busy[i] = true + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",str(c),cc:byte(),str(tpr[c])) + dbg("busy[i]:{%s},i:{%d}",str(busy[i]),i) + dbg("[FindPH]:#LINE# Success! || i=%d",i) + else -- if !busy + dbg("[FindPH]:#LINE# Busy! || i=%d",i) + end -- if !busy -- }}} + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",c,cc:byte(),str(tpr[c])) + dbg("%s",str(busy[i])) + else -- if < or > + dbg("[FindPH]:#LINE# Done!",i) + break + end -- if < or > -- }}} + else -- text!match(cc) + dbg("[FindPH]:#LINE# Text contains this byte! || i=%d",i) + end -- text!match(cc) -- }}} + local skip=1 + if i==31 then + skip=96 -- ASCII + end + i=i+skip + until (i==255) -- }}} + i=nil + --- }}} + + if not(tpr["<"]) or not(tpr[">"]) then + err("Impossible to find at least two unused byte codes in this HTML-code. We need it to escape bracket-contained placeholders inside tags.") + err("Consider enabling 'keep_danger_placeholders' option (to silence this error, if parser wasn't failed with current HTML-code) or manually replace few random bytes, to free up the codes.") + else + dbg("[FindPH]:#LINE# Found! || '<'=%d, '>'=%d",tpr["<"]:byte(),tpr[">"]:byte()) + end + +-- dbg("tpr[>] || tpr[] || #busy%d") + + -- g {{{ + local function g(id,...) + local arg={...} + local orig=arg[id] + arg[id]=arg[id]:gsub("(.)",tpr) + if arg[id] ~= orig then + tpl=true + dbg("[g]:#LINE# orig: %s", str(orig)) + dbg("[g]:#LINE# replaced: %s",str(arg[id])) + end + dbg("[g]:#LINE# called, id: %s, arg[id]: %s, args { "..(("{%s}, "):rep(#arg):gsub(", $","")).." }",id,arg[id],...) + dbg("[g]:#LINE# concat(arg): %s",table.concat(arg)) + return table.concat(arg) + end + -- g }}} + + -- templaters {{{ + text=text:gsub( + [=[(=[%s]-)(['"])]=].. -- only match attr.values, and not random strings between two random quoting marks + [=[([^%2<>]+)]=].. + [=[([^%2>]-)]=].. + [=[(%2)]=], + function(...)return g(4,...)end + ) -- Escape "<" inside attr.values (see issue #50) + text=text:gsub( + [=[(=[%s]-)(['"])]=].. -- only match attr.values, and not random strings between two random quoting marks + [=[([^%2<>]+)]=].. + [=[([^%2<]-)]=].. + [=[(%2)]=], + function(...)return g(4,...)end + ) -- Escape ">" inside attr.values (see issue #50) +--[[ +]] + text = text:gsub( + "(<[^!])".. -- Comments aren't templaters placeholders + "([^>]-)".. + "(>)", + function(...)return g(2,...)end + ) -- scan for a second "<", inside "<>" (if it shows before ">"), until it inside the comment or CDATA + text=text:gsub( + "("..tpr["<"]..")".. + "([^%w%s])".. + "([^%2]-)".. + "(%2)".. + "(>)".. + "([^>]-)".. + "(>)", -- Comments and CDATA aren't templaters placeholders + function(...)return g(5,...)end + ) -- try to find matching ">" for previous replace + -- templaters }}} + end -- }}} local index = 0 local root = ElementNode:new(index, str(text)) - local node, descend, tpos, opentags = root, true, 1, {} - while true do - if index == limit then - err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors") - break - end + while true do -- MainLoop {{{ + if index == limit then -- {{{ + err("Main loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end -- }}} + -- openstart/tpos Definitions {{{ local openstart, name openstart, tpos, name = root._text:find( "<" .. -- an uncaptured starting "<" "([%w-]+)" .. -- name = the first word, directly following the "<" "[^>]*>", -- include, but not capture everything up to the next ">" tpos) - + dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name)) + -- }}} if not name then break end - + -- Some more vars {{{ index = index + 1 - - local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos) + local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) node = tag - local tagloop local tagst, apos = tag:gettext(), 1 - while true do - if tagloop == limit then - err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors") + -- }}} + while true do -- TagLoop {{{ + if tagloop == limit then -- {{{ + err("Tag parsing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) break - end - - local start, k, eq, quote, v - start, apos, k, eq, quote = tagst:find( + end -- }}} + -- Attrs {{{ + local start, k, eq, quote, v, zsp + start, apos, k, zsp, eq, zsp, quote = tagst:find( "%s+" .. -- some uncaptured space "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" + "([%s]-)".. -- zero or more spaces "(=?)" .. -- eq = the optional; "=", else "" - "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" + "([%s]-)".. -- zero or more spaces + [=[(['"]?)]=], -- quote = an optional "'" or '"' following the "=", or "" apos) - + dbg("[TagLoop]:#LINE# start=%s || apos=%s || k=%s || zsp='%s' || eq='%s', quote=[%s]",str(start),str(apos),str(k),str(zsp),str(eq),str(quote)) + -- }}} if not k or k == "/>" or k == ">" then break end - + -- Pattern {{{ if eq == "=" then - pattern = "=([^%s>]*)" + local pattern = "=([^%s>]*)" if quote ~= "" then pattern = quote .. "([^" .. quote .. "]*)" .. quote end start, apos, v = tagst:find(pattern, apos) + dbg("[TagLoop]:#LINE# start=%s || apos=%s || v=%s || pattern=%s",str(start),str(apos),str(v),str(pattern)) end - + -- }}} v=v or "" - - if tpl then + if tpl then -- {{{ for rk,rv in pairs(tpr) do - v = v:gsub(rv,rk) + v = v:gsub(rv,rk) + dbg("[TagLoop]:#LINE# rv=%s || rk=%s",str(rv),str(rk)) end - end + end -- }}} + dbg("[TagLoop]:#LINE# k=%s || v=%s",str(k),str(v)) tag:addattribute(k, v) tagloop = (tagloop or 0) + 1 end - - if voidelements[tag.name:lower()] then + -- }}} + if voidelements[tag.name:lower()] then -- {{{ descend = false tag:close() else opentags[tag.name] = opentags[tag.name] or {} table.insert(opentags[tag.name], tag) end - + -- }}} local closeend = tpos local closingloop - while true do + while true do -- TagCloseLoop {{{ if closingloop == limit then - err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors") + err("Tag closing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) break end local closestart, closing, closename closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename)) if not closing or closing == "" then break end tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags closestart = root._text:find("<", closestart) + dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) tag:close(closestart, closeend + 1) node = tag.parent descend = true closingloop = (closingloop or 0) + 1 - end - end - - if tpl then + end -- }}} + end -- }}} + if tpl then -- {{{ + dbg("tpl") for k,v in pairs(tpr) do root._text = root._text:gsub(v,k) end - end - + end -- }}} return root -end +end -- }}} HtmlParser.parse = parse - return HtmlParser - diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index 381fb90..0c39901 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -106,9 +106,9 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend) table.insert(node.nodes, instance) else instance.root = node.root - instance.parent = node.parent + instance.parent = node.parent or node --XXX: adds some safety but needs more testing for heisenbugs in corner cases instance.level = node.level - table.insert(node.parent.nodes, instance) + table.insert((node.parent and node.parent.nodes or node.nodes), instance) --XXX: see above about heisenbugs end return setmetatable(instance, ElementNode.mt) end