diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9bea433
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+
+.DS_Store
diff --git a/ElementNode.lua b/ElementNode.lua
new file mode 100644
index 0000000..5d92d4b
--- /dev/null
+++ b/ElementNode.lua
@@ -0,0 +1,158 @@
+local Set = require "Set"
+
+--- A node of the parsed document tree: either the root or one element.
+local ElementNode = {}
+ElementNode.mt = {__index = ElementNode}
+
+--- Creates a node and links it into the tree.
+-- Without `node` the new node is the root: `nameortext` is then the whole
+-- document text, which is kept in `_text`, and the node spans all of it.
+-- @param nameortext tag name, or the document text when creating the root
+-- @param node reference node; nil creates the root
+-- @param descend true: become a child of `node`; false: become its sibling
+-- @param openstart index in the root text where the opening tag starts
+-- @param openend index in the root text where the opening tag ends
+-- @return the new node
+function ElementNode:new(nameortext, node, descend, openstart, openend)
+  local instance = {
+    name = nameortext,
+    level = 0,
+    parent = nil,
+    root = nil,
+    nodes = {},
+    -- until close() receives positions, the closing tag coincides with the opening tag
+    _openstart = openstart, _openend = openend,
+    _closestart = openstart, _closeend = openend,
+    attributes = {},
+    id = nil,
+    classes = {},
+    -- indexes over all descendants, filled in when those descendants are closed
+    deepernodes = Set:new(),
+    deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
+  }
+  if not node then
+    instance.name = "root"
+    instance.root = instance
+    instance._text = nameortext
+    local length = string.len(nameortext)
+    instance._openstart, instance._openend = 1, length
+    instance._closestart, instance._closeend = 1, length
+  elseif descend or not node.parent then
+    -- a node without a parent (the root) cannot have siblings, so attach below it
+    instance.root = node.root
+    instance.parent = node
+    instance.level = node.level + 1
+    table.insert(node.nodes, instance)
+  else
+    instance.root = node.root
+    instance.parent = node.parent
+    instance.level = node.level
+    table.insert(node.parent.nodes, instance)
+  end
+  return setmetatable(instance, ElementNode.mt)
+end
+
+--- Returns the node's source text, from the start of its opening tag to the
+-- end of its closing tag.
+function ElementNode:gettext()
+  return string.sub(self.root._text, self._openstart, self._closeend)
+end
+
+--- Returns the source text between the opening and the closing tag.
+function ElementNode:getcontent()
+  return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
+end
+
+--- Stores an attribute and keeps the `id` and `classes` shortcuts up to date.
+-- @param k attribute name
+-- @param v attribute value; nil is stored as the empty string
+function ElementNode:addattribute(k, v)
+  v = v or ""
+  self.attributes[k] = v
+  local lowerk = string.lower(k)
+  if lowerk == "id" then
+    self.id = v
+  elseif lowerk == "class" then
+    -- class attribute contains "space-separated tokens", each of which we'd like quick access to
+    for class in string.gmatch(v, "%S+") do
+      table.insert(self.classes, class)
+    end
+  end
+end
+
+--- Adds `node` to the set stored under `name` in `list`, creating the set on first use.
+local function insert(list, name, node)
+  if not list[name] then
+    list[name] = Set:new()
+  end
+  list[name]:add(node)
+end
+
+--- Marks the element as complete and registers it with all of its ancestors.
+-- @param closestart index in the root text where the closing tag starts (optional)
+-- @param closeend index in the root text where the closing tag ends (optional)
+function ElementNode:close(closestart, closeend)
+  if closestart and closeend then
+    self._closestart, self._closeend = closestart, closeend
+  end
+  -- inform higher level nodes about this element's existence in their branches
+  local node = self.parent
+  while node do
+    node.deepernodes:add(self)
+    insert(node.deeperelements, self.name, self)
+    for k in pairs(self.attributes) do
+      insert(node.deeperattributes, k, self)
+    end
+    if self.id then
+      insert(node.deeperids, self.id, self)
+    end
+    for _, v in ipairs(self.classes) do
+      insert(node.deeperclasses, v, self)
+    end
+    node = node.parent
+  end
+end
+
+--- Selects descendants of this node with a small subset of CSS selectors.
+-- Whitespace-separated parts are descendant steps, a free-standing `>` makes
+-- the next step match children only, and `*` matches every element. A part
+-- may combine an element name with `[attribute]`, `#id` and `.class` tests.
+-- @param s selector string
+-- @return list of matching nodes in unspecified order; empty when `s` is not a
+-- string or contains no selector part
+function ElementNode:select(s)
+  if not s or type(s) ~= "string" then return {} end
+  local subjects = Set:new({self})
+  local resultset
+  local childrenonly = false
+  for part in string.gmatch(s, "%S+") do
+    if part == ">" then
+      childrenonly = true
+    else
+      resultset = Set:new()
+      for subject in pairs(subjects) do
+        local init = subject.deepernodes
+        if childrenonly then init = Set:new(subject.nodes) end
+        resultset = resultset + init
+      end
+      -- reset only after every subject has been restricted to its children
+      childrenonly = false
+      if part ~= "*" then
+        for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
+          if t == "" then resultset = resultset * self.deeperelements[w]
+          elseif t == "[" then resultset = resultset * self.deeperattributes[w]
+          elseif t == "#" then resultset = resultset * self.deeperids[w]
+          elseif t == "." then resultset = resultset * self.deeperclasses[w]
+          end
+        end
+      end
+      -- the matches of this step are the starting points of the next one
+      subjects = Set:new(resultset)
+    end
+  end
+  -- an empty selector (or a lone ">") never produced a result set
+  if not resultset then return {} end
+  return resultset:tolist()
+end
+
+return ElementNode
\ No newline at end of file
diff --git a/HtmlParser.lua b/HtmlParser.lua
new file mode 100644
index 0000000..be80013
--- /dev/null
+++ b/HtmlParser.lua
@@ -0,0 +1,88 @@
+local ElementNode = require("ElementNode")
+local voidelements = require("voidelements")
+
+local HtmlParser = {}
+
+--- Reads the attributes out of an element's opening tag and stores them on it.
+-- Only `name=value` attributes are recognised; the value may be unquoted or
+-- wrapped in single or double quotes.
+local function parseattributes(tag)
+  local tagst, apos = tag:gettext(), 1
+  while true do
+    local _, k, quote, v
+    _, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
+    if not k then break end
+    local pattern = "=([^%s'\">]*)"
+    if quote ~= '' then
+      pattern = quote .. "([^" .. quote .. "]*)" .. quote
+    end
+    _, apos, v = string.find(tagst, pattern, apos)
+    -- an unterminated quote yields no match; stop here, because a nil position
+    -- would restart the search at the first attribute and never terminate
+    if not apos then break end
+    tag:addattribute(k, v)
+  end
+end
+
+--- Parses HTML text into a tree of ElementNode objects.
+-- A closing tag closes the most recently opened element of the same name
+-- (compared case-insensitively); closing tags without an open element are
+-- ignored. Elements that are never closed stay in the tree but are not
+-- registered with their ancestors.
+-- @param text the HTML source
+-- @return the root node
+local function parse(text)
+  if type(text) ~= "string" then
+    error("HtmlParser.parse expects a string, got " .. type(text), 2)
+  end
+  local root = ElementNode:new(text)
+
+  -- opentags maps a lower-cased tag name to a stack of its still open elements
+  local node, descend, tpos, opentags = root, true, 1, {}
+  while true do
+    local openstart, name
+    openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
+    if not name then break end
+    local tag = ElementNode:new(name, node, descend, openstart, tpos)
+    node = tag
+    parseattributes(tag)
+
+    local key = string.lower(tag.name)
+    if voidelements[key] then
+      -- void elements have no content: the next element is a sibling
+      descend = false
+      tag:close()
+    else
+      -- the next element, if any comes before a closing tag, is a child
+      descend = true
+      local stack = opentags[key]
+      if not stack then
+        stack = {}
+        opentags[key] = stack
+      end
+      stack[#stack + 1] = tag
+    end
+
+    -- consume the closing tags that follow before the next opening tag
+    local closeend = tpos
+    while true do
+      local closestart, closing, closename
+      closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
+      if closing ~= "/" then break end
+      local stack = opentags[string.lower(closename)]
+      tag = stack and table.remove(stack)
+      if tag then
+        closestart = string.find(root._text, "<", closestart, true)
+        tag:close(closestart, closeend + 1)
+        node = tag.parent
+        descend = true
+      end
+    end
+  end
+
+  return root
+end
+HtmlParser.parse = parse
+
+return HtmlParser
+
diff --git a/Set.lua b/Set.lua
new file mode 100644
index 0000000..104e31f
--- /dev/null
+++ b/Set.lua
@@ -0,0 +1,100 @@
+--- A set of values, stored as the keys of a table.
+local Set = {}
+Set.mt = {__index = Set}
+
+--- Creates a set.
+-- @param t nil for an empty set, another Set to copy, a list whose elements
+-- become the members, any other table whose keys become the members, or a
+-- single non-table value that becomes the only member
+-- @return the new set
+function Set:new(t)
+  local instance = {}
+  if type(t) == "table" then
+    if getmetatable(t) ~= Set.mt and #t > 0 then
+      for _, v in ipairs(t) do
+        instance[v] = true
+      end
+    else
+      for k in pairs(t) do
+        instance[k] = true
+      end
+    end
+  elseif t ~= nil then
+    -- a single value is the member itself, not the element at index 1
+    instance[t] = true
+  end
+  return setmetatable(instance, Set.mt)
+end
+
+--- Adds `e` to the set.
+function Set:add(e)
+  self[e] = true
+end
+
+--- Removes `e` from the set.
+function Set:remove(e)
+  self[e] = nil
+end
+
+--- Returns `v` itself when it is a Set, otherwise a new Set built from it.
+local function toset(v)
+  if getmetatable(v) == Set.mt then return v end
+  return Set:new(v)
+end
+
+-- Union
+Set.mt.__add = function (a, b)
+  local res = Set:new()
+  for k in pairs(toset(a)) do res[k] = true end
+  for k in pairs(toset(b)) do res[k] = true end
+  return res
+end
+
+-- Subtraction
+Set.mt.__sub = function (a, b)
+  local res = Set:new()
+  for k in pairs(toset(a)) do res[k] = true end
+  for k in pairs(toset(b)) do res[k] = nil end
+  return res
+end
+
+-- Intersection
+Set.mt.__mul = function (a, b)
+  local res = Set:new()
+  a, b = toset(a), toset(b)
+  for k in pairs(a) do
+    -- rawget: a member named like a Set method must not be found through __index
+    if rawget(b, k) then res[k] = true end
+  end
+  return res
+end
+
+-- String representation
+Set.mt.__tostring = function (set)
+  local items = {}
+  for k in pairs(set) do
+    -- members may be tables or other values that cannot be concatenated directly
+    items[#items + 1] = tostring(k)
+  end
+  return "{" .. table.concat(items, ", ") .. "}"
+end
+
+--- Returns the number of members.
+function Set:len()
+  local num = 0
+  for _ in pairs(self) do
+    num = num + 1
+  end
+  return num
+end
+
+--- Returns the members as a list in unspecified order.
+function Set:tolist()
+  local res = {}
+  for k in pairs(self) do
+    table.insert(res, k)
+  end
+  return res
+end
+
+return Set
\ No newline at end of file
diff --git a/test.html b/test.html
new file mode 100644
index 0000000..3e86fb5
--- /dev/null
+++ b/test.html
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+

Contents

+
    +
  1. Preface
  2. +
  3. Introduction
  4. +
  5. Concepts
  6. +
  7. Theory
  8. +
  9. Hypotheses
  10. +
  11. Experiments
  12. +
  13. Conclusions
  14. +
  15. References
  16. +
+

Acknowledgements

+

+ Surely, we could not have done this huge amount of work all by ourselves.
+ Therefore, we cannot thank enough the following persons for their kind contributions: + +

+
+
+
\ No newline at end of file
diff --git a/test.lua b/test.lua
new file mode 100644
index 0000000..77ed097
--- /dev/null
+++ b/test.lua
@@ -0,0 +1,56 @@
+local HtmlParser = require("HtmlParser")
+
+local io = require("io")
+-- read the sample document through its own handle, leaving the default input untouched
+local file = assert(io.open("./test.html", "r"))
+local text = file:read("*a")
+file:close()
+
+local root = HtmlParser.parse(text)
+
+-- print the tree
+local function p(n)
+  local space = string.rep(" ", n.level)
+  local s = space .. n.name
+  for i,v in ipairs(n.nodes) do
+    s = s .. " nodes[" .. i .. "]=" .. v.name
+  end
+  for k,v in pairs(n.attributes) do
+    s = s .. " " .. k .. "=[" .. v .. "]"
+  end
+  print(s)
+  for _,v in ipairs(n.nodes) do
+    p(v)
+  end
+end
+p(root)
+
+-- run one selector against the document and print the matches and their count
+-- (named runselect so that Lua's built-in select is not shadowed)
+local function runselect( s )
+  print ""
+  print("->", s)
+  local tags = root:select(s)
+  for _,t in ipairs(tags) do
+    print(t.name)
+  end
+  print(# tags)
+end
+runselect("*")
+runselect("link")
+runselect("#/contacts/4711")
+runselect(".chapters")
+runselect("[href]")
+runselect("span.firstname")
+runselect("ul[id]")
+
+runselect("#/contacts/4711")
+runselect("#/contacts/4711 *")
+runselect("#/contacts/4711 .lastname")
+runselect("body li[id]")
+
+runselect("ul")
+runselect("ul *")
+runselect("ul > *")
+runselect("body [class]")
+runselect("body > [class]")
diff --git a/voidelements.lua b/voidelements.lua
new file mode 100644
index 0000000..b95b010
--- /dev/null
+++ b/voidelements.lua
@@ -0,0 +1,19 @@
+-- Names of the HTML void elements; the parser closes these as soon as their tag is read.
+return {
+  area = true,
+  base = true,
+  br = true,
+  col = true,
+  command = true,
+  embed = true,
+  hr = true,
+  img = true,
+  input = true,
+  keygen = true,
+  link = true,
+  meta = true,
+  param = true,
+  source = true,
+  track = true,
+  wbr = true
+}