mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
Parse valueless attributes (e.g. itemscope)
Also allow nested tags of the same name; failing to handle them was a newly encountered bug
This commit is contained in:
parent
67be34285e
commit
82dbddfd19
@ -16,14 +16,18 @@ local function parse(text)
|
|||||||
|
|
||||||
local tagst, apos = tag:gettext(), 1
|
local tagst, apos = tag:gettext(), 1
|
||||||
while true do
|
while true do
|
||||||
local start, k, quote, v
|
local start, k, eq, quote, v
|
||||||
start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
|
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
|
||||||
if not k then break end
|
if not k then break end
|
||||||
|
if eq == "" then
|
||||||
|
v = ""
|
||||||
|
else
|
||||||
local pattern = "=([^%s'\">]*)"
|
local pattern = "=([^%s'\">]*)"
|
||||||
if quote ~= '' then
|
if quote ~= '' then
|
||||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||||
end
|
end
|
||||||
start, apos, v = string.find(tagst, pattern, apos)
|
start, apos, v = string.find(tagst, pattern, apos)
|
||||||
|
end
|
||||||
tag:addattribute(k, v)
|
tag:addattribute(k, v)
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -31,17 +35,16 @@ local function parse(text)
|
|||||||
descend = false
|
descend = false
|
||||||
tag:close()
|
tag:close()
|
||||||
else
|
else
|
||||||
opentags[tag.name] = tag
|
opentags[tag.name] = opentags[tag.name] or {}
|
||||||
|
table.insert(opentags[tag.name], tag)
|
||||||
end
|
end
|
||||||
|
|
||||||
local closeend = tpos
|
local closeend = tpos
|
||||||
while true do
|
while true do
|
||||||
local closestart, closing, closename
|
local closestart, closing, closename
|
||||||
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
|
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
|
||||||
closing = closing and closing ~= ''
|
if not closing or closing == "" then break end
|
||||||
if not closing then break end
|
tag = table.remove(opentags[closename])
|
||||||
tag = opentags[closename]
|
|
||||||
opentags[closename] = nil
|
|
||||||
closestart = string.find(root._text, "<", closestart)
|
closestart = string.find(root._text, "<", closestart)
|
||||||
tag:close(closestart, closeend + 1)
|
tag:close(closestart, closeend + 1)
|
||||||
node = tag.parent
|
node = tag.parent
|
||||||
|
18
test.html
18
test.html
@ -39,5 +39,23 @@
|
|||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
<section itemscope itemtype="http://schema.org/Person">
|
||||||
|
Hello, my name is
|
||||||
|
<span itemprop="name">John Doe</span>,
|
||||||
|
I am a
|
||||||
|
<span itemprop="jobTitle">graduate research assistant</span>
|
||||||
|
at the
|
||||||
|
<span itemprop="affiliation">University of Dreams</span>.
|
||||||
|
My friends call me
|
||||||
|
<span itemprop="additionalName">Johnny</span>.
|
||||||
|
You can visit my homepage at
|
||||||
|
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
|
||||||
|
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
|
||||||
|
I live at
|
||||||
|
<span itemprop="streetAddress">1234 Peach Drive</span>,
|
||||||
|
<span itemprop="addressLocality">Warner Robins</span>,
|
||||||
|
<span itemprop="addressRegion">Georgia</span>.
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
Loading…
Reference in New Issue
Block a user