mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
first draft
Initial working version in version control
This commit is contained in:
parent
65aff05b29
commit
76000166e0
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
|
||||
.DS_Store
|
119
ElementNode.lua
Normal file
119
ElementNode.lua
Normal file
@ -0,0 +1,119 @@
|
||||
local Set = require "Set"
|
||||
|
||||
local ElementNode = {}
|
||||
ElementNode.mt = {__index = ElementNode}
|
||||
-- Create a node and link it into the tree.
--   nameortext  tag name of the element; when no `node` is given this is the
--               full source text and the new node becomes the root
--   node        reference node used to position the new one (nil for the root)
--   descend     true: the new node becomes a child of `node`;
--               false: it becomes a sibling (a child of node.parent)
--   openstart, openend  positions of the opening tag in the root text; they
--               also seed the closing positions until close() overrides them
-- Returns the new node, with ElementNode.mt as its metatable.
function ElementNode:new(nameortext, node, descend, openstart, openend)
  -- parent, root and id start out absent (nil) and are filled in later
  local instance = {
    name = nameortext,
    level = 0,
    nodes = {},
    attributes = {},
    classes = {},
    _openstart = openstart, _openend = openend,
    _closestart = openstart, _closeend = openend,
    deepernodes = Set:new(),
    deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
  }

  if node then
    local parent, level
    if descend then
      parent, level = node, node.level + 1
    else
      parent, level = node.parent, node.level
    end
    instance.root = node.root
    instance.parent = parent
    instance.level = level
    table.insert(parent.nodes, instance)
  else
    -- the root owns the source text and spans all of it
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._openend = 1, length
    instance._closestart, instance._closeend = 1, length
  end

  return setmetatable(instance, ElementNode.mt)
end
|
||||
|
||||
-- Return the slice of the root text that runs from the first character of
-- this node's opening tag to the last character of its closing tag.
function ElementNode:gettext()
  local source = self.root._text
  local first, last = self._openstart, self._closeend
  return string.sub(source, first, last)
end
|
||||
|
||||
-- Return the slice of the root text that lies strictly between this node's
-- opening tag and its closing tag.
function ElementNode:getcontent()
  local source = self.root._text
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(source, first, last)
end
|
||||
|
||||
-- Store attribute `k` with value `v` on this node.
-- The attribute name is compared case-insensitively against "id" and "class":
--   * "id" also sets self.id
--   * "class" also refreshes self.classes with its space-separated tokens
-- A non-string class value (e.g. nil) yields an empty class list instead of
-- raising an error in string.gmatch.
function ElementNode:addattribute(k, v)
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
  elseif lowered == "class" then
    -- class attribute contains "space-separated tokens", each of which we'd
    -- like quick access to; rebuild in place so the list always mirrors the
    -- value that was just stored rather than accumulating stale tokens
    local classes = self.classes
    for i = #classes, 1, -1 do
      classes[i] = nil
    end
    if type(v) == "string" then
      for class in string.gmatch(v, "%S+") do
        classes[#classes + 1] = class
      end
    end
  end
end
|
||||
|
||||
-- Add `node` to the Set stored under list[name], creating that Set the first
-- time the name is seen.
local function insert(list, name, node)
  local bucket = list[name]
  if not bucket then
    bucket = Set:new()
    list[name] = bucket
  end
  bucket:add(node)
end
|
||||
|
||||
-- Mark this node as closed and register it with every ancestor.
--   closestart, closeend  positions of the closing tag in the root text;
--                         when either is missing the current values are kept
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart = closestart
    self._closeend = closeend
  end

  -- inform higher level nodes about this element's existence in their
  -- branches, indexed by name, attribute names, id and classes
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attribute in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attribute, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for _, class in ipairs(self.classes) do
      insert(ancestor.deeperclasses, class, self)
    end
    ancestor = ancestor.parent
  end
end
|
||||
|
||||
-- Select descendants of this node with a simplified CSS-like selector.
-- The selector is split on whitespace into parts:
--   ">"            the next part only matches direct children of the subjects
--   "*"            matches every candidate node
--   anything else  a run of terms: a bare word is an element name, "[word" an
--                  attribute name, "#word" an id and ".word" a class; all
--                  terms of one part must match
-- Returns a list (in unspecified order) of matching nodes. A non-string
-- selector, or one that contains no matching part (e.g. "" or ">"), yields
-- an empty list.
function ElementNode:select(s)
  if not s or type(s) ~= "string" then return {} end
  local subjects = Set:new({self})
  local resultset
  local childrenonly = false
  for part in string.gmatch(s, "%S+") do
    if part == ">" then
      childrenonly = true
    else
      -- candidates: children or all descendants of every current subject
      resultset = Set:new()
      for subject in pairs(subjects) do
        local init = subject.deepernodes
        if childrenonly then init = Set:new(subject.nodes) end
        resultset = resultset + init
      end
      -- reset only after ALL subjects were handled, so that ">" restricts
      -- every subject and not just the first one visited
      childrenonly = false
      if part ~= "*" then
        for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
          -- an unknown name indexes nil, which the Set intersection treats
          -- as an empty set
          if t == "" then resultset = resultset * self.deeperelements[w]
          elseif t == "[" then resultset = resultset * self.deeperattributes[w]
          elseif t == "#" then resultset = resultset * self.deeperids[w]
          elseif t == "." then resultset = resultset * self.deeperclasses[w]
          end
        end
      end
      -- the matches of this part (including "*") are the subjects of the next
      subjects = Set:new(resultset)
    end
  end
  if not resultset then return {} end
  return resultset:tolist()
end
|
||||
|
||||
return ElementNode
|
57
HtmlParser.lua
Normal file
57
HtmlParser.lua
Normal file
@ -0,0 +1,57 @@
|
||||
local ElementNode = require("ElementNode")
|
||||
local voidelements = require("voidelements")
|
||||
|
||||
local HtmlParser = {}
|
||||
|
||||
-- Parse HTML text into a tree of ElementNode objects and return its root.
--   text  the HTML source as a string
-- Opening tags are located with the pattern "<(%w+)[^>]*>"; their attributes
-- are read from the tag text; closing tags that follow are matched against
-- the still-open tags of the same (case-insensitive) name, innermost first.
-- Elements listed in `voidelements` are closed immediately. Closing tags
-- without a matching open tag are ignored.
local function parse(text)
  local root = ElementNode:new(text)

  -- node:     the most recently opened tag (or the parent after a close)
  -- descend:  whether the next tag becomes a child (true) or sibling (false)
  -- opentags: lower-cased tag name -> stack of open tags with that name
  local node, descend, tpos, opentags = root, true, 1, {}
  while true do
    local openstart, name
    openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
    if not name then break end
    local tag = ElementNode:new(name, node, descend, openstart, tpos)
    node = tag

    -- attributes: name=value, value optionally wrapped in ' or "
    local tagst, apos = tag:gettext(), 1
    while true do
      local start, k, quote, v
      start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
      if not k then break end
      local pattern = "=([^%s'\">]*)"
      if quote ~= '' then
        pattern = quote .. "([^" .. quote .. "]*)" .. quote
      end
      local valueend
      start, valueend, v = string.find(tagst, pattern, apos)
      -- an unterminated quote gives no value; stop here, because continuing
      -- with a nil position would restart the scan at 1 and never finish
      if not valueend then break end
      apos = valueend
      tag:addattribute(k, v)
    end

    if voidelements[string.lower(tag.name)] then
      -- void elements have no content: the next tag is a sibling
      descend = false
      tag:close()
    else
      -- the next tag (if any comes before a closing tag) is a child of this
      -- one, even when the previous tag was a void sibling
      descend = true
      local key = string.lower(tag.name)
      local stack = opentags[key]
      if not stack then
        stack = {}
        opentags[key] = stack
      end
      stack[#stack + 1] = tag
    end

    -- consume every closing tag that comes before the next opening tag
    local closeend = tpos
    while true do
      local closestart, closing, closename
      closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
      closing = closing and closing ~= ''
      if not closing then break end
      local stack = opentags[string.lower(closename)]
      local closed = stack and table.remove(stack)
      if closed then
        -- the match starts at the text before the tag; find the "<" itself
        closestart = string.find(root._text, "<", closestart)
        closed:close(closestart, closeend + 1)
        node = closed.parent
        descend = true
      end
    end
  end

  return root
end
|
||||
HtmlParser.parse = parse
|
||||
|
||||
return HtmlParser
|
||||
|
87
Set.lua
Normal file
87
Set.lua
Normal file
@ -0,0 +1,87 @@
|
||||
local Set = {}
|
||||
Set.mt = {__index = Set}
|
||||
-- Create a new Set.
--   t  nil            -> empty set
--      table, #t > 0  -> the list's values become the members
--      other table    -> the table's keys become the members (this is how an
--                        existing Set is copied)
--      anything else  -> a set with `t` as its single member
-- Members are stored as keys mapped to true.
function Set:new(t)
  local instance = {}
  if type(t) == "table" then
    if #t > 0 then
      for _, v in ipairs(t) do
        instance[v] = true
      end
    else
      for k in pairs(t) do
        instance[k] = true
      end
    end
  elseif t ~= nil then
    -- store the value as a member (key), not as element 1 of a list
    instance[t] = true
  end
  return setmetatable(instance, Set.mt)
end
|
||||
|
||||
-- Make `e` a member of this set.
function Set:add(e)
  rawset(self, e, true)
end
|
||||
|
||||
-- Drop `e` from this set; removing a non-member changes nothing.
function Set:remove(e)
  rawset(self, e, nil)
end
|
||||
|
||||
-- Union: `a + b` is a new Set holding every member of either operand.
-- An operand that is not a Set is converted with Set:new first.
Set.mt.__add = function (a, b)
  local function asset(operand)
    if getmetatable(operand) == Set.mt then return operand end
    return Set:new(operand)
  end
  local left, right = asset(a), asset(b)
  local union = Set:new()
  for member in pairs(left) do union[member] = true end
  for member in pairs(right) do union[member] = true end
  return union
end
|
||||
|
||||
-- Subtraction: `a - b` is a new Set holding the members of `a` that are not
-- keys of `b`. An operand that is not a Set is converted with Set:new first.
Set.mt.__sub = function (a, b)
  local function asset(operand)
    if getmetatable(operand) == Set.mt then return operand end
    return Set:new(operand)
  end
  local left, right = asset(a), asset(b)
  local difference = Set:new()
  for member in pairs(left) do
    -- raw lookup: only real keys of the right operand remove a member
    if rawget(right, member) == nil then
      difference[member] = true
    end
  end
  return difference
end
|
||||
|
||||
-- Intersection: `a * b` is a new Set holding the members present in both
-- operands. An operand that is not a Set (including nil) is converted with
-- Set:new first, so intersecting with nil gives an empty set.
Set.mt.__mul = function (a, b)
  local res = Set:new()
  if getmetatable(a) ~= Set.mt then a = Set:new(a) end
  if getmetatable(b) ~= Set.mt then b = Set:new(b) end
  for k in pairs(a) do
    -- raw lookup: a plain b[k] would fall through __index to the Set
    -- methods, so a string member such as "len" would wrongly count as
    -- shared and be stored with a function as its value
    if rawget(b, k) ~= nil then
      res[k] = true
    end
  end
  return res
end
|
||||
|
||||
-- String representation: "{m1, m2, ...}" in unspecified member order.
-- Members are converted with tostring, so sets of tables (such as nodes) can
-- be printed instead of raising a concatenation error.
Set.mt.__tostring = function (set)
  local parts = {}
  for k in pairs(set) do
    parts[#parts + 1] = tostring(k)
  end
  return "{" .. table.concat(parts, ", ") .. "}"
end
|
||||
|
||||
-- Count the members of this set by walking all of its keys.
function Set:len()
  local count = 0
  local key = next(self)
  while key ~= nil do
    count = count + 1
    key = next(self, key)
  end
  return count
end
|
||||
|
||||
-- Return the members of this set as a list; the order is whatever pairs()
-- happens to produce.
function Set:tolist()
  local list, size = {}, 0
  for member in pairs(self) do
    size = size + 1
    list[size] = member
  end
  return list
end
|
||||
|
||||
return Set
|
43
test.html
Normal file
43
test.html
Normal file
@ -0,0 +1,43 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<link rel="stylesheet" href="test.css" />
|
||||
<link rel="alternate" title="Feed" type="application/atom+xml" href="#" />
|
||||
</head>
|
||||
<body>
|
||||
<h1>Contents</h1>
|
||||
<ol class="chapters">
|
||||
<li>Preface</li>
|
||||
<li>Introduction</li>
|
||||
<li>Concepts</li>
|
||||
<li>Theory</li>
|
||||
<li>Hypotheses</li>
|
||||
<li>Experiments</li>
|
||||
<li>Conclusions</li>
|
||||
<li>References</li>
|
||||
</ol>
|
||||
<h1>Acknowledgements</h1>
|
||||
<p>
|
||||
Surely, we could not have done this huge amount of work all by ourselves.<br />
|
||||
Therefore, we cannot thank enough the following persons for their kind contributions:
|
||||
<!--
|
||||
Of course, the text in this paragraph only serves presentation purposes, i.e. it's not actually part of the machine-consumable structured data that this API is serving.
|
||||
-->
|
||||
</p>
|
||||
<ul class="contacts">
|
||||
<li id="/contacts/4711">
|
||||
<a href="/contacts/4711">
|
||||
<span class="firstname">Jon</span>
|
||||
<span class="lastname">Moore</span>
|
||||
</a>
|
||||
</li>
|
||||
<li id="/contacts/4712">
|
||||
<a href="/contacts/4712">
|
||||
<span class="firstname">Homer</span>
|
||||
<span class="lastname">Simpson</span>
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
52
test.lua
Normal file
52
test.lua
Normal file
@ -0,0 +1,52 @@
|
||||
local HtmlParser = require("HtmlParser")
|
||||
|
||||
local io = require("io")
|
||||
-- Read the whole fixture through its own handle: this fails with a clear
-- message when the file is missing and leaves the default input untouched.
local file = assert(io.open("./test.html", "r"))
local text = file:read("*a")
file:close()

local root = HtmlParser.parse(text)
|
||||
|
||||
-- print the tree: one line per node, indented by its level, listing the names
-- of its direct children and its attributes, followed by the children's lines
local function p(n)
  local pieces = { string.rep(" ", n.level) .. n.name }
  for index, child in ipairs(n.nodes) do
    pieces[#pieces + 1] = " nodes[" .. index .. "]=" .. child.name
  end
  for key, value in pairs(n.attributes) do
    pieces[#pieces + 1] = " " .. key .. "=[" .. value .. "]"
  end
  print(table.concat(pieces))
  for _, child in ipairs(n.nodes) do
    p(child)
  end
end
|
||||
p(root)
|
||||
|
||||
-- Run selector `s` against the parsed tree and print: an empty line, the
-- selector, the name of every match, and finally the number of matches.
local function select( s )
  print("")
  print("->", s)
  local matches = root:select(s)
  for _, match in ipairs(matches) do
    print(match.name)
  end
  print(#matches)
end
|
||||
-- Selectors to exercise, run in this order.
local selectors = {
  -- single-part selectors
  "*",
  "link",
  "#/contacts/4711",
  ".chapters",
  "[href]",
  "span.firstname",
  "ul[id]",

  -- descendants of a selected subject
  "#/contacts/4711",
  "#/contacts/4711 *",
  "#/contacts/4711 .lastname",
  "body li[id]",

  -- descendants versus direct children
  "ul",
  "ul *",
  "ul > *",
  "body [class]",
  "body > [class]",
}
for _, selector in ipairs(selectors) do
  select(selector)
end
|
18
voidelements.lua
Normal file
18
voidelements.lua
Normal file
@ -0,0 +1,18 @@
|
||||
-- Lookup table of void elements: each listed tag name maps to true.
local names = {
  "area", "base", "br", "col", "command", "embed", "hr", "img",
  "input", "keygen", "link", "meta", "param", "source", "track", "wbr",
}

local voidelements = {}
for _, name in ipairs(names) do
  voidelements[name] = true
end

return voidelements
|
Loading…
Reference in New Issue
Block a user