first draft

Initial working version in version control
This commit is contained in:
Wouter Scherphof 2013-03-19 11:37:08 +01:00
parent 65aff05b29
commit 76000166e0
7 changed files with 378 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.DS_Store

119
ElementNode.lua Normal file
View File

@ -0,0 +1,119 @@
local Set = require "Set"
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}

--- Create a node and link it into the tree.
-- Without `node` the new instance is the root: it is named "root", keeps the
-- complete source text in `_text` and spans that whole text.
-- With `node` the new instance becomes a child of `node` when `descend` is
-- truthy, and a sibling of `node` (a child of `node.parent`) otherwise.
-- @param nameortext element name, or the full source text for the root
-- @param node       reference node (omit to create the root)
-- @param descend    truthy: attach below `node`; falsy: attach next to it
-- @param openstart  position in the source text where the opening tag starts
-- @param openend    position in the source text where the opening tag ends
-- @return the new node, with ElementNode.mt as its metatable
function ElementNode:new(nameortext, node, descend, openstart, openend)
  local instance = {
    name = nameortext,
    level = 0,
    nodes = {},
    -- until close() is told otherwise the closing span equals the opening span
    _openstart = openstart, _openend = openend,
    _closestart = openstart, _closeend = openend,
    attributes = {},
    classes = {},
    -- indexes of everything found below this node, filled in by close()
    deepernodes = Set:new(),
    deeperelements = {},
    deeperattributes = {},
    deeperids = {},
    deeperclasses = {},
  }

  if not node then
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._closestart = 1, 1
    instance._openend, instance._closeend = length, length
    return setmetatable(instance, ElementNode.mt)
  end

  local parent, level
  if descend then
    parent, level = node, node.level + 1
  else
    parent, level = node.parent, node.level
  end
  instance.root = node.root
  instance.parent = parent
  instance.level = level
  table.insert(parent.nodes, instance)
  return setmetatable(instance, ElementNode.mt)
end
--- Return this element's slice of the root's source text, running from the
-- start of its opening span to the end of its closing span.
-- @return string
function ElementNode:gettext()
  local source = self.root._text
  local first, last = self._openstart, self._closeend
  return string.sub(source, first, last)
end
--- Return the source text strictly between the end of the opening span and
-- the start of the closing span (an empty string when the two spans coincide).
-- @return string
function ElementNode:getcontent()
  local source = self.root._text
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(source, first, last)
end
--- Store an attribute on this element.
-- The attribute is kept under its original spelling in `attributes`. When the
-- name is "id" (compared case-insensitively) the value is also stored in
-- `self.id`; when it is "class" every whitespace-separated token of the value
-- is appended to `self.classes` for quick access.
-- @param k attribute name
-- @param v attribute value
function ElementNode:addattribute(k, v)
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
  elseif lowered == "class" then
    -- class attribute contains "space-separated tokens"
    local classes = self.classes
    for token in string.gmatch(v, "%S+") do
      classes[#classes + 1] = token
    end
  end
end
--- Add `node` to the Set stored at `list[name]`, creating that Set on first use.
-- @param list table mapping a name to a Set of nodes
-- @param name key into `list`
-- @param node node to register
local function insert(list, name, node)
  local bucket = list[name]
  if not bucket then
    bucket = Set:new()
    list[name] = bucket
  end
  bucket:add(node)
end
--- Close this element and register it with every ancestor.
-- When both positions are given they replace the element's closing span.
-- Each ancestor then records this element in its `deepernodes` set and in its
-- per-name, per-attribute, per-id and per-class indexes.
-- @param closestart position where the closing tag starts (optional)
-- @param closeend   position where the closing tag ends (optional)
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart, self._closeend = closestart, closeend
  end
  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attribute in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attribute, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for i = 1, #self.classes do
      insert(ancestor.deeperclasses, self.classes[i], self)
    end
    ancestor = ancestor.parent
  end
end
--- Select elements below this node with a small CSS-like selector.
-- The selector is split on whitespace. Each part is one of:
--   ">"                      the next part only matches direct children
--   "*"                      every candidate element
--   name, #id, .class, [attr] in any combination, e.g. "li[id]" or "span.first"
-- Parts are applied left to right; the matches of one part are the subjects
-- whose descendants (or children, after ">") are candidates for the next.
-- @param s selector string
-- @return list (array) of matching nodes, in unspecified order; an empty list
--         when `s` is not a string or contains no selecting part
function ElementNode:select(s)
  if type(s) ~= "string" then return {} end

  -- which index of this node answers which selector prefix
  local indexes = {
    [""] = self.deeperelements,
    ["["] = self.deeperattributes,
    ["#"] = self.deeperids,
    ["."] = self.deeperclasses,
  }

  local subjects = Set:new({self})
  -- start with an empty set so that "" or a lone ">" yields {} rather than
  -- failing on a nil result
  local resultset = Set:new()
  local childrenonly = false

  for part in string.gmatch(s, "%S+") do
    if part == ">" then
      childrenonly = true
    else
      -- gather the candidates of all current subjects
      resultset = Set:new()
      for subject in pairs(subjects) do
        local candidates = subject.deepernodes
        if childrenonly then candidates = Set:new(subject.nodes) end
        resultset = resultset + candidates
      end
      -- the combinator is consumed only after it was applied to every subject
      childrenonly = false

      if part ~= "*" then
        for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
          local index = indexes[t]
          if index then
            resultset = resultset * (index[w] or Set:new())
          end
        end
      end

      -- the set operators always build new sets, so sharing the reference is
      -- safe; "*" must advance the subjects too, or "a > * > b" would still
      -- look at the children of "a"
      subjects = resultset
    end
  end

  return resultset:tolist()
end
return ElementNode

57
HtmlParser.lua Normal file
View File

@ -0,0 +1,57 @@
local ElementNode = require("ElementNode")
local voidelements = require("voidelements")
local HtmlParser = {}

--- Parse HTML text into a tree of ElementNode objects.
-- Opening tags are found with a pattern scan; their attributes are read from
-- the tag text. Void elements are closed immediately. After each opening tag
-- all directly following closing tags are processed.
-- Tolerances of this scanner:
--   * an attribute whose quoted value is not terminated inside the tag text
--     ends attribute scanning for that tag;
--   * closing tags without a matching open element are ignored;
--   * elements of the same name may nest (a closing tag closes the most
--     recently opened element of that name);
--   * tag names are matched case-insensitively when closing.
-- @param text HTML source
-- @return the root ElementNode
local function parse(text)
  local root = ElementNode:new(text)
  local node, descend, tpos = root, true, 1
  -- lowercased tag name -> stack (array) of elements still open
  local opentags = {}

  while true do
    local openstart, name
    openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
    if not name then break end

    local tag = ElementNode:new(name, node, descend, openstart, tpos)
    node = tag

    -- attributes: name=value, value optionally quoted with ' or "
    local tagst, apos = tag:gettext(), 1
    while true do
      local _, nameend, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
      if not k then break end
      local pattern = "=([^%s'\">]*)"
      if quote ~= '' then
        pattern = quote .. "([^" .. quote .. "]*)" .. quote
      end
      local _, valueend, v = string.find(tagst, pattern, nameend)
      -- No closing quote in this tag: stop here. Continuing with a nil
      -- position would restart the scan at 1 and never terminate.
      if not valueend then break end
      tag:addattribute(k, v)
      apos = valueend
    end

    local key = string.lower(tag.name)
    if voidelements[key] then
      -- void elements have no content: the next tag is a sibling
      descend = false
      tag:close()
    else
      -- the next tag (if any comes before a closing tag) is a child of this one
      descend = true
      local stack = opentags[key]
      if not stack then
        stack = {}
        opentags[key] = stack
      end
      stack[#stack + 1] = tag
    end

    -- handle every closing tag that follows before the next opening tag
    local closeend = tpos
    while true do
      local closestart, closing, closename
      closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
      if not closing or closing == '' then break end
      local stack = opentags[string.lower(closename)]
      local closed = stack and table.remove(stack)
      if closed then
        -- the match starts at the text before the tag; locate the "<" itself
        closestart = string.find(root._text, "<", closestart)
        closed:close(closestart, closeend + 1)
        node = closed.parent
        descend = true
      end
      -- a closing tag without an open element is skipped
    end
  end

  return root
end

HtmlParser.parse = parse
return HtmlParser

87
Set.lua Normal file
View File

@ -0,0 +1,87 @@
local Set = {}
Set.mt = {__index = Set}

--- Create a set. Members are stored as keys with the value `true`.
-- @param t what to put in the set:
--   * nil        - an empty set
--   * a Set      - a copy of that set
--   * a table with a non-empty array part - its array values become members
--   * any other table - its keys become members
--   * any other value - a set with that single member
-- @return the new Set
function Set:new(t)
  local instance = {}
  if type(t) == "table" then
    -- A Set is always copied by key; without this check a Set that contains
    -- the integer 1 would be mistaken for a list.
    if getmetatable(t) ~= Set.mt and #t > 0 then
      for _, v in ipairs(t) do
        instance[v] = true
      end
    else
      for k in pairs(t) do
        instance[k] = true
      end
    end
  elseif t ~= nil then
    -- a single value is the member itself, not the array entry {[1] = t}
    instance[t] = true
  end
  return setmetatable(instance, Set.mt)
end
--- Make `e` a member of this set.
-- @param e the member to add
function Set:add(e)
  local member = e
  self[member] = true
end
--- Take `e` out of this set; nothing happens when it is not a member.
-- @param e the member to remove
function Set:remove(e)
  local member = e
  self[member] = nil
end
-- Union: `a + b` is a new set holding every member of either operand.
-- An operand that is not a Set is first converted with Set:new.
Set.mt.__add = function (a, b)
  local function asset(operand)
    if getmetatable(operand) == Set.mt then return operand end
    return Set:new(operand)
  end
  local left, right = asset(a), asset(b)
  local union = Set:new()
  for member in pairs(left) do
    union[member] = true
  end
  for member in pairs(right) do
    union[member] = true
  end
  return union
end
-- Subtraction: `a - b` is a new set holding the members of `a` that are not
-- members of `b`. An operand that is not a Set is first converted with Set:new.
Set.mt.__sub = function (a, b)
  local function asset(operand)
    if getmetatable(operand) == Set.mt then return operand end
    return Set:new(operand)
  end
  local left, right = asset(a), asset(b)
  local difference = Set:new()
  for member in pairs(left) do
    -- rawget: only keys actually stored in `right` count as its members
    if rawget(right, member) == nil then
      difference[member] = true
    end
  end
  return difference
end
-- Intersection: `a * b` is a new set holding the members found in both
-- operands. An operand that is not a Set is first converted with Set:new.
Set.mt.__mul = function (a, b)
  local res = Set:new()
  if getmetatable(a) ~= Set.mt then a = Set:new(a) end
  if getmetatable(b) ~= Set.mt then b = Set:new(b) end
  for k in pairs(a) do
    -- rawget, not b[k]: a plain index falls through __index to the Set
    -- methods, so a member named "len" or "add" would appear to be in `b`
    if rawget(b, k) ~= nil then
      res[k] = true
    end
  end
  return res
end
-- String representation: "{m1, m2, ...}" with the members in the order
-- `pairs` visits them. Members are converted with tostring, so sets of
-- tables or booleans can be printed as well.
Set.mt.__tostring = function (set)
  local parts = {}
  for k in pairs(set) do
    parts[#parts + 1] = tostring(k)
  end
  return "{" .. table.concat(parts, ", ") .. "}"
end
--- Count the members of this set.
-- @return number of keys stored in the set
function Set:len()
  local count = 0
  local member = next(self)
  while member ~= nil do
    count = count + 1
    member = next(self, member)
  end
  return count
end
--- Return the members of this set as an array, in the order `pairs` visits
-- them (which is unspecified).
-- @return array of members
function Set:tolist()
  local list = {}
  local member = next(self)
  while member ~= nil do
    list[#list + 1] = member
    member = next(self, member)
  end
  return list
end
return Set

43
test.html Normal file
View File

@ -0,0 +1,43 @@
<!DOCTYPE html>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5>
<head>
<meta charset="utf-8" />
<link rel="stylesheet" href="test.css" />
<link rel="alternate" title="Feed" type="application/atom+xml" href="#" />
</head>
<body>
<h1>Contents</h1>
<ol class="chapters">
<li>Preface</li>
<li>Introduction</li>
<li>Concepts</li>
<li>Theory</li>
<li>Hypotheses</li>
<li>Experiments</li>
<li>Conclusions</li>
<li>References</li>
</ol>
<h1>Acknowledgements</h1>
<p>
Surely, we could not have done this huge amount of work all by ourselves.<br />
Therefore, we cannot thank enough the following persons for their kind contributions:
<!--
Of course, the text in this paragraph only serves presentation purposes, i.e. it's not actually part of the machine-consumable structured data that this API is serving.
-->
</p>
<ul class="contacts">
<li id="/contacts/4711">
<a href="/contacts/4711">
<span class="firstname">Jon</span>
<span class="lastname">Moore</span>
</a>
</li>
<li id="/contacts/4712">
<a href="/contacts/4712">
<span class="firstname">Homer</span>
<span class="lastname">Simpson</span>
</a>
</li>
</ul>
</body>
</html>

52
test.lua Normal file
View File

@ -0,0 +1,52 @@
-- Manual test script: parses ./test.html, prints the element tree and then
-- the results of a series of selectors.
local HtmlParser = require("HtmlParser")
local io = require("io")

-- Read the fixture through its own handle; the process-wide default input
-- is left untouched.
local file = assert(io.open("./test.html", "r"))
local text = file:read("*a")
file:close()

local root = HtmlParser.parse(text)

--- Print node `n` and, recursively, all nodes below it.
-- Each line shows the node name (indented by its level), its direct children
-- and its attributes.
local function p(n)
  local parts = { string.rep(" ", n.level), n.name }
  for i, v in ipairs(n.nodes) do
    parts[#parts + 1] = " nodes[" .. i .. "]=" .. v.name
  end
  for k, v in pairs(n.attributes) do
    parts[#parts + 1] = " " .. k .. "=[" .. v .. "]"
  end
  print(table.concat(parts))
  for _, v in ipairs(n.nodes) do
    p(v)
  end
end

p(root)

--- Run selector `s` against the root and print the names of the matching
-- elements followed by their count.
-- (Not called `select`, so the built-in of that name stays usable.)
local function runselect(s)
  print ""
  print("->", s)
  local tags = root:select(s)
  for _, t in ipairs(tags) do
    print(t.name)
  end
  print(#tags)
end

local selectors = {
  "*",
  "link",
  "#/contacts/4711",
  ".chapters",
  "[href]",
  "span.firstname",
  "ul[id]",
  "#/contacts/4711",
  "#/contacts/4711 *",
  "#/contacts/4711 .lastname",
  "body li[id]",
  "ul",
  "ul *",
  "ul > *",
  "body [class]",
  "body > [class]",
}
for _, s in ipairs(selectors) do
  runselect(s)
end

18
voidelements.lua Normal file
View File

@ -0,0 +1,18 @@
-- Lookup table of element names that the parser closes immediately after
-- their opening tag: voidelements[name] is true for each name listed below.
local names = {
  "area", "base", "br", "col",
  "command", "embed", "hr", "img",
  "input", "keygen", "link", "meta",
  "param", "source", "track", "wbr",
}

local voidelements = {}
for i = 1, #names do
  voidelements[names[i]] = true
end

return voidelements