mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
Added loop limit, added kludge for template engines, documented loop limit, added error messages, fixes #42
This commit is contained in:
parent
71c0a65006
commit
acad4d40eb
12
README.md
12
README.md
@ -2,25 +2,27 @@
|
||||
|
||||
Parse HTML text into a tree of elements with selectors
|
||||
|
||||
[1]: http://wscherphof.github.com/lua-set/
|
||||
[2]: http://api.jquery.com/category/selectors/
|
||||
[1]: https://api.jquery.com/category/selectors/
|
||||
|
||||
## Install
|
||||
Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser`
|
||||
|
||||
### Dependencies
|
||||
Htmlparser depends on [Lua 5.2](http://www.lua.org/download.html) (while work with LuaJIT, which provides 5.1-compatible ABI), and on the ["lua-set"][1] package, which is installed along automatically. To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock
|
||||
Htmlparser depends on [Lua 5.1-5.3](https://www.lua.org/download.html) or [LuaJIT](https://luajit.org/download.html), which provides a 5.1-compatible ABI.
|
||||
To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock
|
||||
|
||||
## Usage
|
||||
Start off with
|
||||
```lua
|
||||
pcall(require, "luarocks.loader")
|
||||
local htmlparser = require("htmlparser")
|
||||
```
|
||||
Then, parse some html:
|
||||
```lua
|
||||
local root = htmlparser.parse(htmlstring)
|
||||
```
|
||||
Optionally, you can pass a loop-limit value (an integer). This value caps the number of iterations of the parser's loops (for example, the number of elements it will process) before the parser gives up. The default value is 1000.
|
||||
The global variable `htmlparser_looplimit` is also supported; the optional argument takes priority over the global value.
|
||||
|
||||
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
|
||||
Now, find specific contained elements by selecting:
|
||||
```lua
|
||||
@ -43,7 +45,7 @@ end
|
||||
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
|
||||
|
||||
## Selectors
|
||||
Supported selectors are a subset of [jQuery's selectors][2]:
|
||||
Supported selectors are a subset of [jQuery's selectors][1]:
|
||||
|
||||
- `"*"` all contained elements
|
||||
- `"element"` elements with the given tagname
|
||||
|
@ -1,47 +1,108 @@
|
||||
-- vim: ft=lua ts=2
|
||||
|
||||
-- Escape every Lua-pattern magic character in `s` by prefixing it with "%",
-- so the result can be embedded as a literal inside another pattern.
-- Returns the results of string.gsub unchanged: the escaped string,
-- followed by the number of characters that were escaped.
local function esc(s)
  local magic_chars = "([%^%$%(%)%%%.%[%]%*%+%-%?])"
  local escaped = "%%" .. "%1"
  return string.gsub(s, magic_chars, escaped)
end
|
||||
local str = tostring
|
||||
local char = string.char
|
||||
-- Write `s` to standard error exactly as given (no newline is appended).
local function err(s)
  io.stderr:write(s)
end
|
||||
-- Write `s` to standard output exactly as given (no newline is appended).
local function out(s)
  io.stdout:write(s)
end
|
||||
|
||||
local ElementNode = require("htmlparser.ElementNode")
|
||||
local voidelements = require("htmlparser.voidelements")
|
||||
|
||||
local HtmlParser = {}
|
||||
|
||||
local tpl_rep={
|
||||
-- Replace table for template engines syntax that can confuse us.
|
||||
-- Here we're replacing confusing sequences
|
||||
-- (things looking like tags, but appearing where tags can't)
|
||||
-- with definitely invalid UTF-8 byte sequences, and later we'll replace them back
|
||||
["<%"] = char(208,209),
|
||||
["%>"] = char(209,208),
|
||||
}
|
||||
local tpl_rep_rev = {}
|
||||
|
||||
|
||||
local function parse(text)
|
||||
local text=str(text)
|
||||
|
||||
local limit = limit or htmlparser_looplimit or 1000
|
||||
|
||||
local tpl = false
|
||||
for k,v in pairs(tpl_rep) do
|
||||
local mtc="("..esc(k)..")"
|
||||
if text:match(mtc) then
|
||||
tpl=true
|
||||
text=text:gsub(mtc,tpl_rep)
|
||||
tpl_rep_rev[v]=k;
|
||||
end
|
||||
end
|
||||
|
||||
local index = 0
|
||||
local root = ElementNode:new(index, text)
|
||||
local root = ElementNode:new(index, str(text))
|
||||
|
||||
local node, descend, tpos, opentags = root, true, 1, {}
|
||||
while true do
|
||||
if index == limit then
|
||||
err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||
break
|
||||
end
|
||||
|
||||
local openstart, name
|
||||
openstart, tpos, name = string.find(root._text,
|
||||
openstart, tpos, name = root._text:find(
|
||||
"<" .. -- an uncaptured starting "<"
|
||||
"([%w-]+)" .. -- name = the first word, directly following the "<"
|
||||
"[^>]*>", -- include, but not capture everything up to the next ">"
|
||||
tpos)
|
||||
|
||||
if not name then break end
|
||||
|
||||
index = index + 1
|
||||
local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
|
||||
|
||||
local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
|
||||
node = tag
|
||||
|
||||
local tagloop
|
||||
local tagst, apos = tag:gettext(), 1
|
||||
while true do
|
||||
if tagloop == limit then
|
||||
err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||
break
|
||||
end
|
||||
|
||||
local start, k, eq, quote, v
|
||||
start, apos, k, eq, quote = string.find(tagst,
|
||||
start, apos, k, eq, quote = tagst:find(
|
||||
"%s+" .. -- some uncaptured space
|
||||
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
|
||||
"(=?)" .. -- eq = the optional; "=", else ""
|
||||
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
||||
apos)
|
||||
|
||||
if not k or k == "/>" or k == ">" then break end
|
||||
|
||||
if eq == "=" then
|
||||
local pattern = "=([^%s>]*)"
|
||||
if quote ~= "" then
|
||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||
end
|
||||
start, apos, v = string.find(tagst, pattern, apos)
|
||||
start, apos, v = tagst:find(pattern, apos)
|
||||
end
|
||||
tag:addattribute(k, v or "")
|
||||
|
||||
v=v or ""
|
||||
|
||||
if tpl then
|
||||
for rk,rv in pairs(tpl_rep_rev) do
|
||||
local mtc="("..esc(rk)..")"
|
||||
if text:match(mtc) then
|
||||
v = v:gsub(mtc,tpl_rep_rev)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
tag:addattribute(k, v)
|
||||
tagloop = (tagloop or 0) + 1
|
||||
end
|
||||
|
||||
if voidelements[string.lower(tag.name)] then
|
||||
if voidelements[tag.name:lower()] then
|
||||
descend = false
|
||||
tag:close()
|
||||
else
|
||||
@ -50,15 +111,33 @@ local function parse(text)
|
||||
end
|
||||
|
||||
local closeend = tpos
|
||||
local closingloop
|
||||
while true do
|
||||
if closingloop == limit then
|
||||
err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||
break
|
||||
end
|
||||
|
||||
local closestart, closing, closename
|
||||
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
|
||||
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
|
||||
|
||||
if not closing or closing == "" then break end
|
||||
|
||||
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
|
||||
closestart = string.find(root._text, "<", closestart)
|
||||
closestart = root._text:find("<", closestart)
|
||||
tag:close(closestart, closeend + 1)
|
||||
node = tag.parent
|
||||
descend = true
|
||||
closingloop = (closingloop or 0) + 1
|
||||
end
|
||||
end
|
||||
|
||||
if tpl then
|
||||
for k,v in pairs(tpl_rep_rev) do
|
||||
local mtc="("..esc(k)..")"
|
||||
if text:match(mtc) then
|
||||
root._text = root._text:gsub(mtc,tpl_rep_rev)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user