mirror of
https://github.com/TangentFoxy/lua-htmlparser.git
synced 2025-07-28 02:52:19 +00:00
First draft for a Rock setup
This commit is contained in:
62
doc/sample.html
Normal file
62
doc/sample.html
Normal file
@@ -0,0 +1,62 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5 test6=val""6>
|
||||
<head words="testing one two three">
|
||||
<meta charset="utf-8" />
|
||||
<link rel="stylesheet" href="test.css" hreflang="en" />
|
||||
<link rel="alternate" title="Feed" type="application/atom+xml" href="#" hreflang="en-gb" />
|
||||
</head>
|
||||
<body words="testing three four five">
|
||||
<h1>Contents</h1>
|
||||
<ol class="chapters">
|
||||
<li>Preface</li>
|
||||
<li>Introduction</li>
|
||||
<li>Concepts</li>
|
||||
<li>Theory</li>
|
||||
<li>Hypotheses</li>
|
||||
<li>Experiments</li>
|
||||
<li>Conclusions</li>
|
||||
<li>References</li>
|
||||
</ol>
|
||||
<h1>Acknowledgements</h1>
|
||||
<p>
|
||||
Surely, we could not have done this huge amount of work all by ourselves.<br />
|
||||
Therefore, we cannot thank enough the following persons for their kind contributions:
|
||||
<!--
|
||||
Of course, the text in this paragraph only serves presentation purposes, i.e. it's not actually part of the machine-consumable structured data that this API is serving.
|
||||
-->
|
||||
</p>
|
||||
<ul class="contacts">
|
||||
<li id="/contacts/4711">
|
||||
<a href="/contacts/4711" hreflang="en-us">
|
||||
<span class="firstname">Jon</span>
|
||||
<span class="lastname">Moore</span>
|
||||
</a>
|
||||
</li>
|
||||
<li id="/contacts/4712">
|
||||
<a href="/contacts/4712" hreflang="english">
|
||||
<span class="firstname">Homer</span>
|
||||
<span class="lastname">Simpson</span>
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
<h1>About me</h1>
|
||||
<section itemscope itemtype="http://schema.org/Person">
|
||||
Hello, my name is
|
||||
<span itemprop="name">John Doe</span>,
|
||||
I am a
|
||||
<span itemprop="jobTitle">graduate research assistant</span>
|
||||
at the
|
||||
<span itemprop="affiliation">University of Dreams</span>.
|
||||
My friends call me
|
||||
<span itemprop="additionalName">Johnny</span>.
|
||||
You can visit my homepage at
|
||||
<a href="http://www.JohnnyD.com" itemprop="url">www.JohnnyD.com</a>.
|
||||
<section itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
|
||||
I live at
|
||||
<span itemprop="streetAddress">1234 Peach Drive</span>,
|
||||
<span itemprop="addressLocality">Warner Robins</span>,
|
||||
<span itemprop="addressRegion">Georgia</span>.
|
||||
</section>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
140
doc/sample.lua
Normal file
140
doc/sample.lua
Normal file
@@ -0,0 +1,140 @@
|
||||
require("luarocks.loader")
-- Omit next line in actual module clients; it's only to support development of the module itself
package.path = "../src/?.lua;" .. package.path
local htmlparser = require("htmlparser")

local io = require("io")
-- Read the sample document through an explicit handle. io.input() would
-- replace the process-wide default input stream, which the script then
-- closes; assert() surfaces the open error message if the file is missing.
local file = assert(io.open("./sample.html", "r"))
local text = file:read("*a")
file:close()

-- Parse the whole document; `root` is used by every section below.
local root = htmlparser.parse(text)

-- print the tree

--- Print a node and, recursively, all of its child nodes (one line each).
-- A line consists of one space per `n.level`, the node name, and every
-- attribute rendered as ` key=[[value]]`.
-- @param n node table; the fields `level`, `name`, `attributes` and `nodes` are read
local function p(n)
  -- Collect the pieces and join once instead of re-concatenating per attribute.
  local parts = { string.rep(" ", n.level), n.name }
  -- pairs() order is unspecified, so attribute order may differ between runs.
  for k, v in pairs(n.attributes) do
    parts[#parts + 1] = " " .. k .. "=[[" .. v .. "]]"
  end
  print(table.concat(parts))
  for _, child in ipairs(n.nodes) do
    p(child)
  end
end
p(root)

--- Run a selector against the parsed document and print what it matched.
-- Output: an empty line, "->" followed by the selector, the name of every
-- matched element, and finally whatever the result's `len()` method returns.
-- NOTE(review): this local shadows Lua's built-in `select` for the rest of
-- the file; the name is kept because the statements below call it.
-- @param s selector string handed to `root:select`
local function select(s)
  print("")
  print("->", s)
  local matches = root:select(s)
  -- The result is iterated by key: each key is a matched element.
  for element in pairs(matches) do
    print(element.name)
  end
  print(matches:len())
end

-- Exercise the selector support against sample.html. Every call prints the
-- selector followed by the names of the elements it matched.

-- single-part selectors
select("*")
select("link")
select("#/contacts/4711")
select(".chapters")
select("[href]")
select("span.firstname")
select("ul[id]")

-- selectors made of several space-separated parts
select("#/contacts/4711")
select("#/contacts/4711 *")
select("#/contacts/4711 .lastname")
select("body li[id]")

-- space-separated parts compared with ">"-separated parts
select("ul")
select("ul *")
select("ul > *")
select("body [class]")
select("body > [class]")

-- :not(...)
select(".contacts span:not(.firstname)")
select(":not(a)[href]")
select("[itemscope]:not([itemprop])")

-- attribute values, including the quoting variants used in sample.html
select("link[rel='alternate']")
select("[test2=\"val='2'\"]")
select("[test5='val5']")
select("[test6='val\"\"6']")
select("[itemscope='']")
select("[itemscope=]")
select("[itemscope]")

-- attribute operators: = != |= *= ~= $= ^=
select("[itemscope][itemprop='address']")
select("[itemscope][itemprop!='address']")
select("[itemscope][itemprop!='adres']")
select("[itemscope][itemprop!='']")
select("[hreflang|='en']")
select("[itemprop*='address']")
select("[words~='two']")
select("[words~='three']")
select("[itemprop$='ion']")
select("[hreflang^='en']")

print("\nchapters")
-- Collect the content of every element matched by the selector.
-- `root` is called directly with the selector string here.
local sel, chapters = root("ol.chapters > li"), {}
-- The result is walked with pairs(), so the collection order is whatever
-- pairs() yields; it is not guaranteed to be document order.
for e in pairs(sel) do
  chapters[#chapters + 1] = e:getcontent()
end
-- print
for i, v in ipairs(chapters) do
  print(i, v)
end

print("\ncontacts")
-- Build contacts[id][class] = content from the matched <span> elements.
local sel, contacts = root("ul.contacts span[class]"), {}
for e in pairs(sel) do
  local id = e.parent.parent.id -- li > a > span
  contacts[id] = contacts[id] or {}
  -- the first class of the span is used as the field name
  contacts[id][e.classes[1]] = e:getcontent()
end
-- print
for k, v in pairs(contacts) do
  print(k)
  for fk, fv in pairs(v) do
    print(fk, fv)
  end
end

print("\nmicrodata")
-- Build a nested table of microdata:
--   scopes[topscope][nestedscope]...[propertyname] = content
-- where scope keys are the element nodes themselves.
local sel, scopes = root("[itemprop]"), {}
for prop in pairs(sel) do
  -- An element carrying both itemprop and itemscope is a nested scope, not a
  -- plain value; it is skipped here and appears as a table key through the
  -- properties it contains.
  if not prop.attributes["itemscope"] then
    local descendantscopes, scope = {}, prop
    -- Climb to the nearest ancestor with itemscope. While that ancestor is
    -- itself an itemprop (a nested scope), remember it and keep climbing.
    -- Stop at nil as well, so a property without any enclosing itemscope is
    -- ignored instead of raising "attempt to index a nil value".
    while true do
      repeat
        scope = scope.parent
      until scope == nil or scope.attributes["itemscope"]
      if scope == nil or not scope.attributes["itemprop"] then break end
      table.insert(descendantscopes, 1, scope)
    end
    if scope ~= nil then
      scopes[scope] = scopes[scope] or {}
      local entry = scopes[scope]
      -- descend outermost-first through the nested scopes
      for _, v in ipairs(descendantscopes) do
        entry[v] = entry[v] or {}
        entry = entry[v]
      end
      local k, v = prop.attributes["itemprop"], prop:getcontent()
      entry[k] = v
    end
  end
end
-- print

--- Print one microdata scope and, recursively, the scopes nested inside it.
-- The header line shows the scope's itemtype and itemprop (each "" when the
-- attribute is absent), followed by one `name=[value]` line per property.
-- @param node scope element; `attributes.itemtype` and `attributes.itemprop` are read
-- @param props table whose keys are either property names (value: content)
--   or nested scope nodes (value: that scope's own table)
-- @param level nesting depth used for indentation; defaults to 1
local function printscope(node, props, level)
  level = level or 1
  local scopeprop = node.attributes["itemprop"] or ""
  -- A scope need not carry an itemtype; fall back to "" so the concatenation
  -- below cannot fail on nil.
  local scopetype = node.attributes["itemtype"] or ""
  print(string.rep(" ", level - 1) .. scopetype, scopeprop)
  for prop, v in pairs(props) do
    if type(prop) == "table" then
      -- table keys are nested scope nodes
      printscope(prop, v, level + 1)
    else
      print(string.rep(" ", level) .. prop .. "=[" .. v .. "]")
    end
  end
end
-- Print every top-level scope; the loop variable is not called `table` so the
-- standard `table` library stays visible inside the loop.
for node, props in pairs(scopes) do
  printscope(node, props)
end
|
Reference in New Issue
Block a user