mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-04 23:34:20 +00:00
64f3eb4df3
... regarding [attribute=value] and other matching. Also explained (in comments) the different matching patterns. And fixed a bug where /> would be listed as an attribute. And added a few more tests.
70 lines
2.1 KiB
Lua
70 lines
2.1 KiB
Lua
local ElementNode = require("ElementNode")
|
|
local voidelements = require("voidelements")
|
|
|
|
local HtmlParser = {}
|
|
|
|
local function parse(text)
|
|
local root = ElementNode:new(text)
|
|
|
|
local node, descend, tpos, opentags = root, true, 1, {}
|
|
while true do
|
|
local openstart, name
|
|
openstart, tpos, name = string.find(root._text,
|
|
"<" .. -- an uncaptured starting "<"
|
|
"(%w+)" .. -- name = the first word, directly following the "<"
|
|
"[^>]*>", -- include, but not capture everything up to the next ">"
|
|
tpos)
|
|
if not name then break end
|
|
local tag = ElementNode:new(name, node, descend, openstart, tpos)
|
|
node = tag
|
|
|
|
local tagst, apos = tag:gettext(), 1
|
|
while true do
|
|
local start, k, eq, quote, v
|
|
start, apos, k, eq, quote = string.find(tagst,
|
|
"%s+" .. -- some uncaptured space
|
|
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
|
|
"(=?)" .. -- eq = the optiona; "=", else ""
|
|
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
|
apos)
|
|
if not k or k == "/>" then break end
|
|
if eq == "" then
|
|
v = ""
|
|
else
|
|
local pattern = "=([^%s>]*)"
|
|
if quote ~= '' then
|
|
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
|
end
|
|
start, apos, v = string.find(tagst, pattern, apos)
|
|
end
|
|
tag:addattribute(k, v)
|
|
end
|
|
|
|
if voidelements[string.lower(tag.name)] then
|
|
descend = false
|
|
tag:close()
|
|
else
|
|
opentags[tag.name] = opentags[tag.name] or {}
|
|
table.insert(opentags[tag.name], tag)
|
|
end
|
|
|
|
local closeend = tpos
|
|
while true do
|
|
local closestart, closing, closename
|
|
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
|
|
if not closing or closing == "" then break end
|
|
tag = table.remove(opentags[closename])
|
|
closestart = string.find(root._text, "<", closestart)
|
|
tag:close(closestart, closeend + 1)
|
|
node = tag.parent
|
|
descend = true
|
|
end
|
|
end
|
|
|
|
return root
|
|
end
|
|
HtmlParser.parse = parse
|
|
|
|
return HtmlParser
|
|
|