diff --git a/README.md b/README.md index 370245c..7b2a445 100644 --- a/README.md +++ b/README.md @@ -2,25 +2,27 @@ Parse HTML text into a tree of elements with selectors -[1]: http://wscherphof.github.com/lua-set/ -[2]: http://api.jquery.com/category/selectors/ +[1]: https://api.jquery.com/category/selectors/ ## Install Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser` ### Dependencies -Htmlparser depends on [Lua 5.2](http://www.lua.org/download.html) (while work with LuaJIT, which provides 5.1-compatible ABI), and on the ["lua-set"][1] package, which is installed along automatically. To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock +Htmlparser depends on [Lua 5.1-5.3](https://www.lua.org/download.html) or [LuaJIT](https://luajit.org/download.html), which provides a 5.1-compatible ABI. +To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock. ## Usage Start off with ```lua -pcall(require, "luarocks.loader") local htmlparser = require("htmlparser") ``` Then, parse some html: ```lua local root = htmlparser.parse(htmlstring) ``` +Optionally, you can pass a loop-limit value (an integer). This value is the depth of the tree after which the parser will give up. The default value is 1000. +The global variable `htmlparser_looplimit` is also supported (the optional argument takes priority over the global value). + The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed. Now, find specific contained elements by selecting: ```lua @@ -43,7 +45,7 @@ end The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element. 
## Selectors -Supported selectors are a subset of [jQuery's selectors][2]: +Supported selectors are a subset of [jQuery's selectors][1]: - `"*"` all contained elements - `"element"` elements with the given tagname diff --git a/src/htmlparser.lua b/src/htmlparser.lua index a96ac58..4ecd809 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,47 +1,108 @@ -- vim: ft=lua ts=2 + +local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end +local str = tostring +local char = string.char +local err = function(s) io.stderr:write(s) end +local out = function(s) io.stdout:write(s) end + local ElementNode = require("htmlparser.ElementNode") local voidelements = require("htmlparser.voidelements") local HtmlParser = {} +local tpl_rep={ + -- Replace table for template engines syntax that can confuse us. + -- Here we're replacing confusing sequences + -- (things looking like tags, but appearing where tags can't) + -- with definitely invalid UTF-8 sequences, and later we'll replace them back + ["<%"] = char(208,209), + ["%>"] = char(209,208), +} +local tpl_rep_rev = {} + + local function parse(text) + local text=str(text) + + local limit = limit or htmlparser_looplimit or 1000 + + local tpl = false + for k,v in pairs(tpl_rep) do + local mtc="("..esc(k)..")" + if text:match(mtc) then + tpl=true + text=text:gsub(mtc,tpl_rep) + tpl_rep_rev[v]=k; + end + end + local index = 0 - local root = ElementNode:new(index, text) + local root = ElementNode:new(index, str(text)) local node, descend, tpos, opentags = root, true, 1, {} while true do + if index == limit then + err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors") + break + end + local openstart, name - openstart, tpos, name = string.find(root._text, + openstart, tpos, name = root._text:find( "<" .. -- an uncaptured starting "<" "([%w-]+)" .. 
-- name = the first word, directly following the "<" "[^>]*>", -- include, but not capture everything up to the next ">" tpos) + if not name then break end + index = index + 1 - local tag = ElementNode:new(index, name, node, descend, openstart, tpos) + + local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos) node = tag + local tagloop local tagst, apos = tag:gettext(), 1 while true do + if tagloop == limit then + err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors") + break + end + local start, k, eq, quote, v - start, apos, k, eq, quote = string.find(tagst, + start, apos, k, eq, quote = tagst:find( "%s+" .. -- some uncaptured space "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" "(=?)" .. -- eq = the optional; "=", else "" "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" apos) + if not k or k == "/>" or k == ">" then break end + if eq == "=" then local pattern = "=([^%s>]*)" if quote ~= "" then pattern = quote .. "([^" .. quote .. "]*)" .. quote end - start, apos, v = string.find(tagst, pattern, apos) + start, apos, v = tagst:find(pattern, apos) end - tag:addattribute(k, v or "") + + v=v or "" + + if tpl then + for rk,rv in pairs(tpl_rep_rev) do + local mtc="("..esc(rk)..")" + if text:match(mtc) then + v = v:gsub(mtc,tpl_rep_rev) + end + end + end + + tag:addattribute(k, v) + tagloop = (tagloop or 0) + 1 end - if voidelements[string.lower(tag.name)] then + if voidelements[tag.name:lower()] then descend = false tag:close() else @@ -50,15 +111,33 @@ local function parse(text) end local closeend = tpos + local closingloop while true do + if closingloop == limit then + err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). 
Please, consider increasing it or check the code for errors") + break + end + local closestart, closing, closename - closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend) + closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + if not closing or closing == "" then break end + tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags - closestart = string.find(root._text, "<", closestart) + closestart = root._text:find("<", closestart) tag:close(closestart, closeend + 1) node = tag.parent descend = true + closingloop = (closingloop or 0) + 1 + end + end + + if tpl then + for k,v in pairs(tpl_rep_rev) do + local mtc="("..esc(k)..")" + if text:match(mtc) then + root._text = root._text:gsub(mtc,tpl_rep_rev) + end end end