Added loop limit, added kludge for template engines, documented loop limit, added error messages, fixes #42

This commit is contained in:
Vadim A. Misbakh-Soloviov 2017-04-09 01:55:25 +07:00
parent 71c0a65006
commit acad4d40eb
No known key found for this signature in database
GPG Key ID: 26503D349B3B334B
2 changed files with 95 additions and 14 deletions

View File

@ -2,25 +2,27 @@
Parse HTML text into a tree of elements with selectors Parse HTML text into a tree of elements with selectors
[1]: http://wscherphof.github.com/lua-set/ [1]: https://api.jquery.com/category/selectors/
[2]: http://api.jquery.com/category/selectors/
## Install ## Install
Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser` Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser`
### Dependencies ### Dependencies
Htmlparser depends on [Lua 5.2](http://www.lua.org/download.html) (while work with LuaJIT, which provides 5.1-compatible ABI), and on the ["lua-set"][1] package, which is installed along automatically. To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock Htmlparser depends on [Lua 5.1-5.3](https://www.lua.org/download.html) or [LuaJIT](https://luajit.org/download.html), which provides 5.1-compatible ABI.
To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock
## Usage ## Usage
Start off with Start off with
```lua ```lua
pcall(require, "luarocks.loader")
local htmlparser = require("htmlparser") local htmlparser = require("htmlparser")
``` ```
Then, parse some html: Then, parse some html:
```lua ```lua
local root = htmlparser.parse(htmlstring) local root = htmlparser.parse(htmlstring)
``` ```
Optionally, you can pass a loop-limit value (an integer). It caps how many iterations each of the parser's loops may run before the parser gives up and prints an error message to stderr. The default value is 1000.
The global variable `htmlparser_looplimit` is also supported, but the optional argument takes priority over the global value.
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed. The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
Now, find specific contained elements by selecting: Now, find specific contained elements by selecting:
```lua ```lua
@ -43,7 +45,7 @@ end
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element. The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
## Selectors ## Selectors
Supported selectors are a subset of [jQuery's selectors][2]: Supported selectors are a subset of [jQuery's selectors][1]:
- `"*"` all contained elements - `"*"` all contained elements
- `"element"` elements with the given tagname - `"element"` elements with the given tagname

View File

@ -1,47 +1,108 @@
-- vim: ft=lua ts=2 -- vim: ft=lua ts=2
-- Escape every Lua-pattern magic character in `s` by prefixing it with "%",
-- so the result can be embedded in a pattern and matched literally.
-- Returns what string.gsub returns: the escaped string, followed by the
-- number of characters that were escaped.
local esc = function(s)
  local magic_chars = "([%^%$%(%)%%%.%[%]%*%+%-%?])"
  local escaped, count = string.gsub(s, magic_chars, "%%%1")
  return escaped, count
end
-- Short local aliases for standard-library functions used below.
local str = tostring
local char = string.char
-- Write `s` to standard error exactly as given (no newline is appended).
local err = function(s) io.stderr:write(s) end
-- Write `s` to standard output exactly as given (no newline is appended).
local out = function(s) io.stdout:write(s) end
local ElementNode = require("htmlparser.ElementNode") local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements") local voidelements = require("htmlparser.voidelements")
local HtmlParser = {} local HtmlParser = {}
-- Placeholder table for the template-engine delimiters "<%" and "%>".
local tpl_rep={
-- Replacement table for template-engine syntax that can confuse the parser.
-- Confusing sequences (things that look like tags but appear where tags
-- cannot) are replaced with a definitely invalid UTF-8 byte sequence,
-- and are swapped back to the original text later.
-- Bytes 208 and 209 (0xD0, 0xD1) are both UTF-8 lead bytes, so neither pair
-- below can occur in well-formed UTF-8 text.
["<%"] = char(208,209),
["%>"] = char(209,208),
}
-- Reverse mapping (placeholder -> original sequence). parse() adds an entry
-- for each template sequence it actually finds in its input.
-- NOTE(review): this table is module-level, so entries persist across
-- parse() calls -- confirm that is intended.
local tpl_rep_rev = {}
local function parse(text) local function parse(text)
local text=str(text)
local limit = limit or htmlparser_looplimit or 1000
local tpl = false
for k,v in pairs(tpl_rep) do
local mtc="("..esc(k)..")"
if text:match(mtc) then
tpl=true
text=text:gsub(mtc,tpl_rep)
tpl_rep_rev[v]=k;
end
end
local index = 0 local index = 0
local root = ElementNode:new(index, text) local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {} local node, descend, tpos, opentags = root, true, 1, {}
while true do while true do
if index == limit then
err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local openstart, name local openstart, name
openstart, tpos, name = string.find(root._text, openstart, tpos, name = root._text:find(
"<" .. -- an uncaptured starting "<" "<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<" "([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">" "[^>]*>", -- include, but not capture everything up to the next ">"
tpos) tpos)
if not name then break end if not name then break end
index = index + 1 index = index + 1
local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
node = tag node = tag
local tagloop
local tagst, apos = tag:gettext(), 1 local tagst, apos = tag:gettext(), 1
while true do while true do
if tagloop == limit then
err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local start, k, eq, quote, v local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst, start, apos, k, eq, quote = tagst:find(
"%s+" .. -- some uncaptured space "%s+" .. -- some uncaptured space
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
"(=?)" .. -- eq = the optional; "=", else "" "(=?)" .. -- eq = the optional; "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos) apos)
if not k or k == "/>" or k == ">" then break end if not k or k == "/>" or k == ">" then break end
if eq == "=" then if eq == "=" then
local pattern = "=([^%s>]*)" local pattern = "=([^%s>]*)"
if quote ~= "" then if quote ~= "" then
pattern = quote .. "([^" .. quote .. "]*)" .. quote pattern = quote .. "([^" .. quote .. "]*)" .. quote
end end
start, apos, v = string.find(tagst, pattern, apos) start, apos, v = tagst:find(pattern, apos)
end
tag:addattribute(k, v or "")
end end
if voidelements[string.lower(tag.name)] then v=v or ""
if tpl then
for rk,rv in pairs(tpl_rep_rev) do
local mtc="("..esc(rk)..")"
if text:match(mtc) then
v = v:gsub(mtc,tpl_rep_rev)
end
end
end
tag:addattribute(k, v)
tagloop = (tagloop or 0) + 1
end
if voidelements[tag.name:lower()] then
descend = false descend = false
tag:close() tag:close()
else else
@ -50,15 +111,33 @@ local function parse(text)
end end
local closeend = tpos local closeend = tpos
local closingloop
while true do while true do
if closingloop == limit then
err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local closestart, closing, closename local closestart, closing, closename
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend) closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
if not closing or closing == "" then break end if not closing or closing == "" then break end
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
closestart = string.find(root._text, "<", closestart) closestart = root._text:find("<", closestart)
tag:close(closestart, closeend + 1) tag:close(closestart, closeend + 1)
node = tag.parent node = tag.parent
descend = true descend = true
closingloop = (closingloop or 0) + 1
end
end
if tpl then
for k,v in pairs(tpl_rep_rev) do
local mtc="("..esc(k)..")"
if text:match(mtc) then
root._text = root._text:gsub(mtc,tpl_rep_rev)
end
end end
end end