lua-htmlparser/htmlparser.lua
Wouter Scherphof 64f3eb4df3 Moved some details to a function
... regarding [attribute=value] and other matching.
Also explained (in comments) the different matching patterns.
And fixed a bug where /> would be listed as an attribute.
And added a few more tests.
2013-03-25 13:55:35 +01:00

70 lines
2.1 KiB
Lua

local ElementNode = require("ElementNode")
local voidelements = require("voidelements")
local HtmlParser = {}
local function parse(text)
local root = ElementNode:new(text)
local node, descend, tpos, opentags = root, true, 1, {}
while true do
local openstart, name
openstart, tpos, name = string.find(root._text,
"<" .. -- an uncaptured starting "<"
"(%w+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
if not name then break end
local tag = ElementNode:new(name, node, descend, openstart, tpos)
node = tag
local tagst, apos = tag:gettext(), 1
while true do
local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst,
"%s+" .. -- some uncaptured space
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
"(=?)" .. -- eq = the optiona; "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" then break end
if eq == "" then
v = ""
else
local pattern = "=([^%s>]*)"
if quote ~= '' then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end
start, apos, v = string.find(tagst, pattern, apos)
end
tag:addattribute(k, v)
end
if voidelements[string.lower(tag.name)] then
descend = false
tag:close()
else
opentags[tag.name] = opentags[tag.name] or {}
table.insert(opentags[tag.name], tag)
end
local closeend = tpos
while true do
local closestart, closing, closename
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
if not closing or closing == "" then break end
tag = table.remove(opentags[closename])
closestart = string.find(root._text, "<", closestart)
tag:close(closestart, closeend + 1)
node = tag.parent
descend = true
end
end
return root
end
HtmlParser.parse = parse
return HtmlParser