mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
Added loop limit, added kludge for template engines, documented loop limit, added error messages, fixes #42
This commit is contained in:
parent
71c0a65006
commit
acad4d40eb
12
README.md
12
README.md
@ -2,25 +2,27 @@
|
|||||||
|
|
||||||
Parse HTML text into a tree of elements with selectors
|
Parse HTML text into a tree of elements with selectors
|
||||||
|
|
||||||
[1]: http://wscherphof.github.com/lua-set/
|
[1]: https://api.jquery.com/category/selectors/
|
||||||
[2]: http://api.jquery.com/category/selectors/
|
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser`
|
Htmlparser is a listed [LuaRock](http://luarocks.org/repositories/rocks/). Install using [LuaRocks](http://www.luarocks.org/): `luarocks install htmlparser`
|
||||||
|
|
||||||
### Dependencies
|
### Dependencies
|
||||||
Htmlparser depends on [Lua 5.2](http://www.lua.org/download.html) (while work with LuaJIT, which provides 5.1-compatible ABI), and on the ["lua-set"][1] package, which is installed along automatically. To be able to run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock
|
Htmlparser depends on [Lua 5.1-5.3](https://www.lua.org/download.html) or [LuaJIT](https://luajit.org/download.html), which provides a 5.1-compatible ABI.
|
||||||
|
To run the tests, [lunitx](https://github.com/dcurrie/lunit) also comes along as a LuaRock.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
Start off with
|
Start off with
|
||||||
```lua
|
```lua
|
||||||
pcall(require, "luarocks.loader")
|
|
||||||
local htmlparser = require("htmlparser")
|
local htmlparser = require("htmlparser")
|
||||||
```
|
```
|
||||||
Then, parse some html:
|
Then, parse some html:
|
||||||
```lua
|
```lua
|
||||||
local root = htmlparser.parse(htmlstring)
|
local root = htmlparser.parse(htmlstring)
|
||||||
```
|
```
|
||||||
|
Optionally, you can pass a loop-limit value (an integer). This value is the depth of the tree at which the parser gives up. The default value is 1000.
|
||||||
|
The global variable `htmlparser_looplimit` is also supported; the optional argument takes priority over the global value.
|
||||||
|
|
||||||
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
|
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
|
||||||
Now, find specific contained elements by selecting:
|
Now, find specific contained elements by selecting:
|
||||||
```lua
|
```lua
|
||||||
@ -43,7 +45,7 @@ end
|
|||||||
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
|
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
|
||||||
|
|
||||||
## Selectors
|
## Selectors
|
||||||
Supported selectors are a subset of [jQuery's selectors][2]:
|
Supported selectors are a subset of [jQuery's selectors][1]:
|
||||||
|
|
||||||
- `"*"` all contained elements
|
- `"*"` all contained elements
|
||||||
- `"element"` elements with the given tagname
|
- `"element"` elements with the given tagname
|
||||||
|
@ -1,47 +1,108 @@
|
|||||||
-- vim: ft=lua ts=2
|
-- vim: ft=lua ts=2
|
||||||
|
|
||||||
|
-- Escape every Lua-pattern magic character (^ $ ( ) % . [ ] * + - ?) in `s`
-- by prefixing it with "%", so the result matches `s` literally when used as a pattern.
-- The outer parentheses discard gsub's second result (the substitution count):
-- without them, a call like `text:match(esc(k))` would pass that count as `init`.
-- @param s string to escape
-- @return the escaped string (exactly one value)
local esc = function(s) return (string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")) end
|
||||||
|
local str = tostring
|
||||||
|
local char = string.char
|
||||||
|
-- Write `s` to standard error exactly as given; no newline is appended.
local function err(s)
  io.stderr:write(s)
end
|
||||||
|
-- Write `s` to standard output exactly as given; no newline is appended.
local function out(s)
  io.stdout:write(s)
end
|
||||||
|
|
||||||
local ElementNode = require("htmlparser.ElementNode")
|
local ElementNode = require("htmlparser.ElementNode")
|
||||||
local voidelements = require("htmlparser.voidelements")
|
local voidelements = require("htmlparser.voidelements")
|
||||||
|
|
||||||
local HtmlParser = {}
|
local HtmlParser = {}
|
||||||
|
|
||||||
|
local tpl_rep={
|
||||||
|
-- Replace table for template engines syntax that can confuse us.
|
||||||
|
-- Here we're replacing confusing sequences
|
||||||
|
-- (things looking like tags, but appearing where tags can't)
|
||||||
|
-- with a definitely invalid UTF-8 sequence; later we'll replace them back
|
||||||
|
["<%"] = char(208,209),
|
||||||
|
["%>"] = char(209,208),
|
||||||
|
}
|
||||||
|
local tpl_rep_rev = {}
|
||||||
|
|
||||||
|
|
||||||
local function parse(text)
|
local function parse(text)
|
||||||
|
local text=str(text)
|
||||||
|
|
||||||
|
local limit = limit or htmlparser_looplimit or 1000
|
||||||
|
|
||||||
|
local tpl = false
|
||||||
|
for k,v in pairs(tpl_rep) do
|
||||||
|
local mtc="("..esc(k)..")"
|
||||||
|
if text:match(mtc) then
|
||||||
|
tpl=true
|
||||||
|
text=text:gsub(mtc,tpl_rep)
|
||||||
|
tpl_rep_rev[v]=k;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
local index = 0
|
local index = 0
|
||||||
local root = ElementNode:new(index, text)
|
local root = ElementNode:new(index, str(text))
|
||||||
|
|
||||||
local node, descend, tpos, opentags = root, true, 1, {}
|
local node, descend, tpos, opentags = root, true, 1, {}
|
||||||
while true do
|
while true do
|
||||||
|
if index == limit then
|
||||||
|
err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||||
|
break
|
||||||
|
end
|
||||||
|
|
||||||
local openstart, name
|
local openstart, name
|
||||||
openstart, tpos, name = string.find(root._text,
|
openstart, tpos, name = root._text:find(
|
||||||
"<" .. -- an uncaptured starting "<"
|
"<" .. -- an uncaptured starting "<"
|
||||||
"([%w-]+)" .. -- name = the first word, directly following the "<"
|
"([%w-]+)" .. -- name = the first word, directly following the "<"
|
||||||
"[^>]*>", -- include, but not capture everything up to the next ">"
|
"[^>]*>", -- include, but not capture everything up to the next ">"
|
||||||
tpos)
|
tpos)
|
||||||
|
|
||||||
if not name then break end
|
if not name then break end
|
||||||
|
|
||||||
index = index + 1
|
index = index + 1
|
||||||
local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
|
|
||||||
|
local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
|
||||||
node = tag
|
node = tag
|
||||||
|
|
||||||
|
local tagloop
|
||||||
local tagst, apos = tag:gettext(), 1
|
local tagst, apos = tag:gettext(), 1
|
||||||
while true do
|
while true do
|
||||||
|
if tagloop == limit then
|
||||||
|
err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||||
|
break
|
||||||
|
end
|
||||||
|
|
||||||
local start, k, eq, quote, v
|
local start, k, eq, quote, v
|
||||||
start, apos, k, eq, quote = string.find(tagst,
|
start, apos, k, eq, quote = tagst:find(
|
||||||
"%s+" .. -- some uncaptured space
|
"%s+" .. -- some uncaptured space
|
||||||
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
|
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
|
||||||
"(=?)" .. -- eq = the optional; "=", else ""
|
"(=?)" .. -- eq = the optional; "=", else ""
|
||||||
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
||||||
apos)
|
apos)
|
||||||
|
|
||||||
if not k or k == "/>" or k == ">" then break end
|
if not k or k == "/>" or k == ">" then break end
|
||||||
|
|
||||||
if eq == "=" then
|
if eq == "=" then
|
||||||
local pattern = "=([^%s>]*)"
|
local pattern = "=([^%s>]*)"
|
||||||
if quote ~= "" then
|
if quote ~= "" then
|
||||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||||
end
|
end
|
||||||
start, apos, v = string.find(tagst, pattern, apos)
|
start, apos, v = tagst:find(pattern, apos)
|
||||||
end
|
end
|
||||||
tag:addattribute(k, v or "")
|
|
||||||
|
v=v or ""
|
||||||
|
|
||||||
|
if tpl then
|
||||||
|
for rk,rv in pairs(tpl_rep_rev) do
|
||||||
|
local mtc="("..esc(rk)..")"
|
||||||
|
if text:match(mtc) then
|
||||||
|
v = v:gsub(mtc,tpl_rep_rev)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
tag:addattribute(k, v)
|
||||||
|
tagloop = (tagloop or 0) + 1
|
||||||
end
|
end
|
||||||
|
|
||||||
if voidelements[string.lower(tag.name)] then
|
if voidelements[tag.name:lower()] then
|
||||||
descend = false
|
descend = false
|
||||||
tag:close()
|
tag:close()
|
||||||
else
|
else
|
||||||
@ -50,15 +111,33 @@ local function parse(text)
|
|||||||
end
|
end
|
||||||
|
|
||||||
local closeend = tpos
|
local closeend = tpos
|
||||||
|
local closingloop
|
||||||
while true do
|
while true do
|
||||||
|
if closingloop == limit then
|
||||||
|
err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
|
||||||
|
break
|
||||||
|
end
|
||||||
|
|
||||||
local closestart, closing, closename
|
local closestart, closing, closename
|
||||||
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
|
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
|
||||||
|
|
||||||
if not closing or closing == "" then break end
|
if not closing or closing == "" then break end
|
||||||
|
|
||||||
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
|
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
|
||||||
closestart = string.find(root._text, "<", closestart)
|
closestart = root._text:find("<", closestart)
|
||||||
tag:close(closestart, closeend + 1)
|
tag:close(closestart, closeend + 1)
|
||||||
node = tag.parent
|
node = tag.parent
|
||||||
descend = true
|
descend = true
|
||||||
|
closingloop = (closingloop or 0) + 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if tpl then
|
||||||
|
for k,v in pairs(tpl_rep_rev) do
|
||||||
|
local mtc="("..esc(k)..")"
|
||||||
|
if text:match(mtc) then
|
||||||
|
root._text = root._text:gsub(mtc,tpl_rep_rev)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user