Extracting microdata is fairly tedious compared to microformats, due to
its generality. It should probably be included as a standard function of
the parser or, even better, as a separate module, which can then
concern itself with being fully standards-compliant.
This commit is contained in:
Wouter Scherphof 2013-03-22 00:10:24 +01:00
parent 78d99a61f6
commit f9b04866b4
2 changed files with 57 additions and 34 deletions

View File

@ -39,23 +39,24 @@
</a> </a>
</li> </li>
</ul> </ul>
<section itemscope itemtype="http://schema.org/Person"> <h1>About me</h1>
Hello, my name is <section itemscope itemtype="http://schema.org/Person">
<span itemprop="name">John Doe</span>, Hello, my name is
I am a <span itemprop="name">John Doe</span>,
<span itemprop="jobTitle">graduate research assistant</span> I am a
at the <span itemprop="jobTitle">graduate research assistant</span>
<span itemprop="affiliation">University of Dreams</span>. at the
My friends call me <span itemprop="affiliation">University of Dreams</span>.
<span itemprop="additionalName">Johnny</span>. My friends call me
You can visit my homepage at <span itemprop="additionalName">Johnny</span>.
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>. You can visit my homepage at
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"> <a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
I live at <section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
<span itemprop="streetAddress">1234 Peach Drive</span>, I live at
<span itemprop="addressLocality">Warner Robins</span>, <span itemprop="streetAddress">1234 Peach Drive</span>,
<span itemprop="addressRegion">Georgia</span>. <span itemprop="addressLocality">Warner Robins</span>,
<span itemprop="addressRegion">Georgia</span>.
</section>
</section> </section>
</section>
</body> </body>
</html> </html>

View File

@ -51,22 +51,24 @@ select("ul > *")
select("body [class]") select("body [class]")
select("body > [class]") select("body > [class]")
print("\nchapters")
local sel, chapters = root("ol.chapters > li"), {} local sel, chapters = root("ol.chapters > li"), {}
for _,v in ipairs(sel.nodes) do for _,v in ipairs(sel.nodes) do
table.insert(chapters, v:getcontent()) table.insert(chapters, v:getcontent())
end end
print("\nchapters") -- print
for i,v in ipairs(chapters) do for i,v in ipairs(chapters) do
print(i, v) print(i, v)
end end
print("\ncontacts")
local sel, contacts = root("ul.contacts > li")("span[class]"), {} local sel, contacts = root("ul.contacts > li")("span[class]"), {}
for _,v in ipairs(sel.nodes) do for _,v in ipairs(sel.nodes) do
local id = v.parent.parent.id -- li > a > span local id = v.parent.parent.id -- li > a > span
contacts[id] = contacts[id] or {} contacts[id] = contacts[id] or {}
contacts[id][v.classes[1]] = v:getcontent() contacts[id][v.classes[1]] = v:getcontent()
end end
print("\ncontacts") -- print
for k,v in pairs(contacts) do for k,v in pairs(contacts) do
print(k) print(k)
for fk,fv in pairs(v) do for fk,fv in pairs(v) do
@ -75,20 +77,40 @@ for k,v in pairs(contacts) do
end end
print("\nmicrodata") print("\nmicrodata")
local sel, scopes = root("[itemscope]"), {} local sel, scopes = root("[itemprop]"), {}
for i,v in ipairs(sel.nodes) do for _,prop in ipairs(sel.nodes) do
local type = v.attributes["itemtype"] if prop.attributes["itemscope"] then goto nextprop end
if not v.attributes["itemprop"] then local descendantscopes, scope = {}, prop
scopes[type] = scopes[type] or {} while true do
local item = {} repeat
local sel = sel("[itemprop]") scope = scope.parent
for i,v in ipairs(sel.nodes) do until scope.attributes["itemscope"]
-- TODO if not scope.attributes["itemprop"] then break end
print("prop", v.attributes["itemprop"]) table.insert(descendantscopes, 1, scope)
end end
table.insert(scopes[type], item) scopes[scope] = scopes[scope] or {}
local entry = scopes[scope]
for _,v in ipairs(descendantscopes) do
entry[v] = entry[v] or {}
entry = entry[v]
end
local k, v = prop.attributes["itemprop"], prop:getcontent()
entry[k] = v
::nextprop::
end
-- print
local function printscope(node, table, level)
level = level or 1
local scopeprop = node.attributes["itemprop"] or ""
print(string.rep(" ", level - 1) .. node.attributes["itemtype"], scopeprop)
for prop,v in pairs(table) do
if type(prop) == "table" then
printscope(prop, v, level + 1)
else
print(string.rep(" ", level) .. prop .. "=[" .. v .. "]")
end
end end
end end
for node,table in pairs(scopes) do
printscope(node, table)
end