mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
closes #7
Extracting microdata is fairly tedious compared to microformats, due to its generality. It should probably be included as a standard function of the parser or, even better, as a separate module, which can then concern itself with being fully standards-compliant.
This commit is contained in:
parent
78d99a61f6
commit
f9b04866b4
35
test.html
35
test.html
@ -39,23 +39,24 @@
|
|||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
<section itemscope itemtype="http://schema.org/Person">
|
<h1>About me</h1>
|
||||||
Hello, my name is
|
<section itemscope itemtype="http://schema.org/Person">
|
||||||
<span itemprop="name">John Doe</span>,
|
Hello, my name is
|
||||||
I am a
|
<span itemprop="name">John Doe</span>,
|
||||||
<span itemprop="jobTitle">graduate research assistant</span>
|
I am a
|
||||||
at the
|
<span itemprop="jobTitle">graduate research assistant</span>
|
||||||
<span itemprop="affiliation">University of Dreams</span>.
|
at the
|
||||||
My friends call me
|
<span itemprop="affiliation">University of Dreams</span>.
|
||||||
<span itemprop="additionalName">Johnny</span>.
|
My friends call me
|
||||||
You can visit my homepage at
|
<span itemprop="additionalName">Johnny</span>.
|
||||||
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
|
You can visit my homepage at
|
||||||
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
|
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
|
||||||
I live at
|
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
|
||||||
<span itemprop="streetAddress">1234 Peach Drive</span>,
|
I live at
|
||||||
<span itemprop="addressLocality">Warner Robins</span>,
|
<span itemprop="streetAddress">1234 Peach Drive</span>,
|
||||||
<span itemprop="addressRegion">Georgia</span>.
|
<span itemprop="addressLocality">Warner Robins</span>,
|
||||||
|
<span itemprop="addressRegion">Georgia</span>.
|
||||||
|
</section>
|
||||||
</section>
|
</section>
|
||||||
</section>
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
56
test.lua
56
test.lua
@ -51,22 +51,24 @@ select("ul > *")
|
|||||||
select("body [class]")
|
select("body [class]")
|
||||||
select("body > [class]")
|
select("body > [class]")
|
||||||
|
|
||||||
|
print("\nchapters")
|
||||||
local sel, chapters = root("ol.chapters > li"), {}
|
local sel, chapters = root("ol.chapters > li"), {}
|
||||||
for _,v in ipairs(sel.nodes) do
|
for _,v in ipairs(sel.nodes) do
|
||||||
table.insert(chapters, v:getcontent())
|
table.insert(chapters, v:getcontent())
|
||||||
end
|
end
|
||||||
print("\nchapters")
|
-- print
|
||||||
for i,v in ipairs(chapters) do
|
for i,v in ipairs(chapters) do
|
||||||
print(i, v)
|
print(i, v)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
print("\ncontacts")
|
||||||
local sel, contacts = root("ul.contacts > li")("span[class]"), {}
|
local sel, contacts = root("ul.contacts > li")("span[class]"), {}
|
||||||
for _,v in ipairs(sel.nodes) do
|
for _,v in ipairs(sel.nodes) do
|
||||||
local id = v.parent.parent.id -- li > a > span
|
local id = v.parent.parent.id -- li > a > span
|
||||||
contacts[id] = contacts[id] or {}
|
contacts[id] = contacts[id] or {}
|
||||||
contacts[id][v.classes[1]] = v:getcontent()
|
contacts[id][v.classes[1]] = v:getcontent()
|
||||||
end
|
end
|
||||||
print("\ncontacts")
|
-- print
|
||||||
for k,v in pairs(contacts) do
|
for k,v in pairs(contacts) do
|
||||||
print(k)
|
print(k)
|
||||||
for fk,fv in pairs(v) do
|
for fk,fv in pairs(v) do
|
||||||
@ -75,20 +77,40 @@ for k,v in pairs(contacts) do
|
|||||||
end
|
end
|
||||||
|
|
||||||
print("\nmicrodata")
|
print("\nmicrodata")
|
||||||
local sel, scopes = root("[itemscope]"), {}
|
local sel, scopes = root("[itemprop]"), {}
|
||||||
for i,v in ipairs(sel.nodes) do
|
for _,prop in ipairs(sel.nodes) do
|
||||||
local type = v.attributes["itemtype"]
|
if prop.attributes["itemscope"] then goto nextprop end
|
||||||
if not v.attributes["itemprop"] then
|
local descendantscopes, scope = {}, prop
|
||||||
scopes[type] = scopes[type] or {}
|
while true do
|
||||||
local item = {}
|
repeat
|
||||||
local sel = sel("[itemprop]")
|
scope = scope.parent
|
||||||
for i,v in ipairs(sel.nodes) do
|
until scope.attributes["itemscope"]
|
||||||
-- TODO
|
if not scope.attributes["itemprop"] then break end
|
||||||
print("prop", v.attributes["itemprop"])
|
table.insert(descendantscopes, 1, scope)
|
||||||
end
|
end
|
||||||
table.insert(scopes[type], item)
|
scopes[scope] = scopes[scope] or {}
|
||||||
|
local entry = scopes[scope]
|
||||||
|
for _,v in ipairs(descendantscopes) do
|
||||||
|
entry[v] = entry[v] or {}
|
||||||
|
entry = entry[v]
|
||||||
|
end
|
||||||
|
local k, v = prop.attributes["itemprop"], prop:getcontent()
|
||||||
|
entry[k] = v
|
||||||
|
::nextprop::
|
||||||
|
end
|
||||||
|
-- print
|
||||||
|
local function printscope(node, table, level)
|
||||||
|
level = level or 1
|
||||||
|
local scopeprop = node.attributes["itemprop"] or ""
|
||||||
|
print(string.rep(" ", level - 1) .. node.attributes["itemtype"], scopeprop)
|
||||||
|
for prop,v in pairs(table) do
|
||||||
|
if type(prop) == "table" then
|
||||||
|
printscope(prop, v, level + 1)
|
||||||
|
else
|
||||||
|
print(string.rep(" ", level) .. prop .. "=[" .. v .. "]")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
for node,table in pairs(scopes) do
|
||||||
|
printscope(node, table)
|
||||||
|
end
|
Loading…
Reference in New Issue
Block a user