Added loop limit, added kludge for template engines, documented loop limit, added error messages, fixes #42

This commit is contained in:
Vadim A. Misbakh-Soloviov 2017-04-09 01:55:25 +07:00
parent 71c0a65006
commit acad4d40eb
No known key found for this signature in database
GPG Key ID: 26503D349B3B334B
2 changed files with 95 additions and 14 deletions

View File

@ -2,25 +2,27 @@
Parse HTML text into a tree of elements with selectors
[1]: http://wscherphof.github.com/lua-set/
[2]: http://api.jquery.com/category/selectors/
[1]: https://api.jquery.com/category/selectors/
## Install
Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser`
### Dependencies
Htmlparser depends on [Lua 5.2](http://www.lua.org/download.html) (while work with LuaJIT, which provides 5.1-compatible ABI), and on the ["lua-set"][1] package, which is installed along automatically. To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock
Htmlparser depends on [Lua 5.1-5.3](https://www.lua.org/download.html) or [LuaJIT](https://luajit.org/download.html), which provides a 5.1-compatible ABI.
To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock.
## Usage
Start off with
```lua
pcall(require, "luarocks.loader")
local htmlparser = require("htmlparser")
```
Then, parse some html:
```lua
local root = htmlparser.parse(htmlstring)
```
Optionally, you can pass a loop-limit value (an integer). This value is the depth of the tree after which the parser will give up. The default value is 1000.
Also, the global variable `htmlparser_looplimit` is supported (the optional argument takes priority over the global value).
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
Now, find specific contained elements by selecting:
```lua
@ -43,7 +45,7 @@ end
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
## Selectors
Supported selectors are a subset of [jQuery's selectors][2]:
Supported selectors are a subset of [jQuery's selectors][1]:
- `"*"` all contained elements
- `"element"` elements with the given tagname

View File

@ -1,47 +1,108 @@
-- vim: ft=lua ts=2
--- Escape every Lua-pattern magic character in `s` so that the result,
-- used as a pattern, matches `s` literally.
-- The gsub call is wrapped in parentheses so that only the escaped string is
-- returned; without them string.gsub's second result (the substitution count)
-- would leak into any argument list or table constructor that ends with esc().
-- @param s string to escape
-- @return the escaped string (exactly one value)
local esc = function(s)
  return (string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1"))
end
-- Short local aliases for standard library helpers.
local str = tostring
local char = string.char
-- Write `s` to standard error exactly as given (no newline is appended).
local err = function(s) io.stderr:write(s) end
-- Write `s` to standard output exactly as given (no newline is appended).
local out = function(s) io.stdout:write(s) end
local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements")
local HtmlParser = {}
-- Template-engine delimiters that look like tags but may occur where tags
-- cannot. Each delimiter is mapped to a two-byte placeholder that is not a
-- valid UTF-8 sequence; the placeholders are swapped back for the original
-- delimiters later on.
local tpl_rep = {
  ["<%"] = char(208, 209),
  ["%>"] = char(209, 208),
}
-- Reverse mapping (placeholder -> original delimiter); starts out empty.
local tpl_rep_rev = {}
local function parse(text)
local text=str(text)
local limit = limit or htmlparser_looplimit or 1000
local tpl = false
for k,v in pairs(tpl_rep) do
local mtc="("..esc(k)..")"
if text:match(mtc) then
tpl=true
text=text:gsub(mtc,tpl_rep)
tpl_rep_rev[v]=k;
end
end
local index = 0
local root = ElementNode:new(index, text)
local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {}
while true do
if index == limit then
err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local openstart, name
openstart, tpos, name = string.find(root._text,
openstart, tpos, name = root._text:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
if not name then break end
index = index + 1
local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
node = tag
local tagloop
local tagst, apos = tag:gettext(), 1
while true do
if tagloop == limit then
err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst,
start, apos, k, eq, quote = tagst:find(
"%s+" .. -- some uncaptured space
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
"(=?)" .. -- eq = the optional; "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" or k == ">" then break end
if eq == "=" then
local pattern = "=([^%s>]*)"
if quote ~= "" then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end
start, apos, v = string.find(tagst, pattern, apos)
start, apos, v = tagst:find(pattern, apos)
end
tag:addattribute(k, v or "")
v=v or ""
if tpl then
for rk,rv in pairs(tpl_rep_rev) do
local mtc="("..esc(rk)..")"
if text:match(mtc) then
v = v:gsub(mtc,tpl_rep_rev)
end
end
end
tag:addattribute(k, v)
tagloop = (tagloop or 0) + 1
end
if voidelements[string.lower(tag.name)] then
if voidelements[tag.name:lower()] then
descend = false
tag:close()
else
@ -50,15 +111,33 @@ local function parse(text)
end
local closeend = tpos
local closingloop
while true do
if closingloop == limit then
err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
break
end
local closestart, closing, closename
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
if not closing or closing == "" then break end
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
closestart = string.find(root._text, "<", closestart)
closestart = root._text:find("<", closestart)
tag:close(closestart, closeend + 1)
node = tag.parent
descend = true
closingloop = (closingloop or 0) + 1
end
end
if tpl then
for k,v in pairs(tpl_rep_rev) do
local mtc="("..esc(k)..")"
if text:match(mtc) then
root._text = root._text:gsub(mtc,tpl_rep_rev)
end
end
end