mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-04 23:34:20 +00:00
Merge pull request #11 from wscherphof/issue-#7---microdata
Issue #7 microdata
This commit is contained in:
commit
d932191a52
@ -16,14 +16,18 @@ local function parse(text)
|
||||
|
||||
local tagst, apos = tag:gettext(), 1
|
||||
while true do
|
||||
local start, k, quote, v
|
||||
start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
|
||||
local start, k, eq, quote, v
|
||||
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
|
||||
if not k then break end
|
||||
local pattern = "=([^%s'\">]*)"
|
||||
if quote ~= '' then
|
||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||
if eq == "" then
|
||||
v = ""
|
||||
else
|
||||
local pattern = "=([^%s'\">]*)"
|
||||
if quote ~= '' then
|
||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||
end
|
||||
start, apos, v = string.find(tagst, pattern, apos)
|
||||
end
|
||||
start, apos, v = string.find(tagst, pattern, apos)
|
||||
tag:addattribute(k, v)
|
||||
end
|
||||
|
||||
@ -31,17 +35,16 @@ local function parse(text)
|
||||
descend = false
|
||||
tag:close()
|
||||
else
|
||||
opentags[tag.name] = tag
|
||||
opentags[tag.name] = opentags[tag.name] or {}
|
||||
table.insert(opentags[tag.name], tag)
|
||||
end
|
||||
|
||||
local closeend = tpos
|
||||
while true do
|
||||
local closestart, closing, closename
|
||||
closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
|
||||
closing = closing and closing ~= ''
|
||||
if not closing then break end
|
||||
tag = opentags[closename]
|
||||
opentags[closename] = nil
|
||||
if not closing or closing == "" then break end
|
||||
tag = table.remove(opentags[closename])
|
||||
closestart = string.find(root._text, "<", closestart)
|
||||
tag:close(closestart, closeend + 1)
|
||||
node = tag.parent
|
||||
|
19
test.html
19
test.html
@ -39,5 +39,24 @@
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
<h1>About me</h1>
|
||||
<section itemscope itemtype="http://schema.org/Person">
|
||||
Hello, my name is
|
||||
<span itemprop="name">John Doe</span>,
|
||||
I am a
|
||||
<span itemprop="jobTitle">graduate research assistant</span>
|
||||
at the
|
||||
<span itemprop="affiliation">University of Dreams</span>.
|
||||
My friends call me
|
||||
<span itemprop="additionalName">Johnny</span>.
|
||||
You can visit my homepage at
|
||||
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
|
||||
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
|
||||
I live at
|
||||
<span itemprop="streetAddress">1234 Peach Drive</span>,
|
||||
<span itemprop="addressLocality">Warner Robins</span>,
|
||||
<span itemprop="addressRegion">Georgia</span>.
|
||||
</section>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
46
test.lua
46
test.lua
@ -51,22 +51,24 @@ select("ul > *")
|
||||
select("body [class]")
|
||||
select("body > [class]")
|
||||
|
||||
print("\nchapters")
|
||||
local sel, chapters = root("ol.chapters > li"), {}
|
||||
for _,v in ipairs(sel.nodes) do
|
||||
table.insert(chapters, v:getcontent())
|
||||
end
|
||||
print("\nchapters")
|
||||
-- print
|
||||
for i,v in ipairs(chapters) do
|
||||
print(i, v)
|
||||
end
|
||||
|
||||
print("\ncontacts")
|
||||
local sel, contacts = root("ul.contacts > li")("span[class]"), {}
|
||||
for _,v in ipairs(sel.nodes) do
|
||||
local id = v.parent.parent.id -- li > a > span
|
||||
contacts[id] = contacts[id] or {}
|
||||
contacts[id][v.classes[1]] = v:getcontent()
|
||||
end
|
||||
print("\ncontacts")
|
||||
-- print
|
||||
for k,v in pairs(contacts) do
|
||||
print(k)
|
||||
for fk,fv in pairs(v) do
|
||||
@ -74,5 +76,41 @@ for k,v in pairs(contacts) do
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
print("\nmicrodata")
-- Collect microdata into `scopes`:
--   scopes[topscope][nestedscope]...[itemprop name] = element content
-- where every scope key is the element node carrying `itemscope`.
-- A top-level scope is an `itemscope` element that has no `itemprop` of its
-- own; an `itemscope` element that also has `itemprop` is a nested scope.
local sel, scopes = root("[itemprop]"), {}
for _, prop in ipairs(sel.nodes) do
  -- Elements that open a scope themselves are not plain properties; they are
  -- recorded as nested scopes when their descendants are processed.
  if prop.attributes["itemscope"] then goto nextprop end
  do
    -- descendantscopes: chain of nested scopes between the top-level scope
    -- and the property, outermost first.
    local descendantscopes, scope = {}, prop
    while true do
      -- Climb to the nearest ancestor that declares itemscope. Stop when the
      -- ancestors run out so a stray itemprop (no enclosing itemscope) does
      -- not end up indexing nil.
      repeat
        scope = scope.parent
      until not scope or scope.attributes["itemscope"]
      if not scope or not scope.attributes["itemprop"] then break end
      table.insert(descendantscopes, 1, scope)
    end
    -- No top-level scope found: the property does not belong to any item.
    if not scope then goto nextprop end
    scopes[scope] = scopes[scope] or {}
    local entry = scopes[scope]
    -- Descend through (and create on demand) the nested scope tables.
    for _, nested in ipairs(descendantscopes) do
      entry[nested] = entry[nested] or {}
      entry = entry[nested]
    end
    local k, v = prop.attributes["itemprop"], prop:getcontent()
    entry[k] = v
  end
  ::nextprop::
end
|
||||
-- print
|
||||
--- Print one microdata scope followed, recursively, by everything stored under it.
-- The header line shows the scope's `itemtype` and, as a second print
-- argument, its `itemprop` (empty string when the attribute is absent).
-- @param node   scope element; `node.attributes` is read for "itemtype" and "itemprop"
-- @param props  table of collected data: string keys map property name to value,
--               table keys are nested scope nodes mapping to their own data table
-- @param level  nesting depth used for indentation (defaults to 1)
local function printscope(node, props, level)
  level = level or 1
  local attributes = node.attributes
  local scopeprop = attributes["itemprop"] or ""
  -- itemscope may be declared without itemtype; fall back to "" so the
  -- concatenation below cannot fail on a nil value.
  local scopetype = attributes["itemtype"] or ""
  print(string.rep(" ", level - 1) .. scopetype, scopeprop)
  for prop, v in pairs(props) do
    if type(prop) == "table" then
      -- A node used as key marks a nested scope; v is its data table.
      printscope(prop, v, level + 1)
    else
      print(string.rep(" ", level) .. prop .. "=[" .. v .. "]")
    end
  end
end
|
||||
-- Dump every collected top-level scope together with its data.
for scopenode, props in pairs(scopes) do
  printscope(scopenode, props)
end
|
Loading…
Reference in New Issue
Block a user