-- Modules added by this change set, restored from a whitespace-collapsed diff
-- and bundled into one chunk:
--   .gitignore      (ignores .DS_Store)
--   ElementNode.lua
--   HtmlParser.lua
--   Set.lua
--
-- Each module is registered in package.preload under its original name, so
-- `require "Set"`, `require "ElementNode"` and `require "HtmlParser"` keep
-- working exactly as they did with one file per module.  "voidelements" is
-- required by HtmlParser but is not part of this change set.

---------------------------------------------------------------------------
-- ElementNode.lua
---------------------------------------------------------------------------
package.preload["ElementNode"] = function()
  local Set = require "Set"

  local ElementNode = {}
  ElementNode.mt = {__index = ElementNode}

  --- Create a node.
  -- Without `node` the new instance is the root and `nameortext` is the
  -- whole document text.  Otherwise `nameortext` is the element name and the
  -- instance becomes a child of `node` (when `descend` is truthy) or a
  -- sibling of `node` (when it is not).
  -- @param nameortext document text (root) or element name
  -- @param node       reference node, nil for the root
  -- @param descend    truthy: attach below `node`; falsy: attach beside it
  -- @param openstart  index of the first character of the opening tag
  -- @param openend    index of the last character of the opening tag
  function ElementNode:new(nameortext, node, descend, openstart, openend)
    local instance = {
      name = nameortext,
      level = 0,
      parent = nil,
      root = nil,
      nodes = {},
      _openstart = openstart, _openend = openend,
      -- until close() is called the element spans just its opening tag
      _closestart = openstart, _closeend = openend,
      attributes = {},
      id = nil,
      classes = {},
      deepernodes = Set:new(),
      deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
    }
    if not node then
      instance.name = "root"
      instance.root = instance
      instance._text = nameortext
      local length = string.len(nameortext)
      instance._openstart, instance._openend = 1, length
      instance._closestart, instance._closeend = 1, length
    elseif descend then
      instance.root = node.root
      instance.parent = node
      instance.level = node.level + 1
      table.insert(node.nodes, instance)
    else
      instance.root = node.root
      instance.parent = node.parent
      instance.level = node.level
      table.insert(node.parent.nodes, instance)
    end
    return setmetatable(instance, ElementNode.mt)
  end

  --- Text of the element including its opening and closing tags.
  function ElementNode:gettext()
    return string.sub(self.root._text, self._openstart, self._closeend)
  end

  --- Text between the opening and the closing tag.
  function ElementNode:getcontent()
    return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
  end

  --- Record an attribute; `id` and `class` are additionally indexed.
  function ElementNode:addattribute(k, v)
    self.attributes[k] = v
    local key = string.lower(k)
    if key == "id" then
      self.id = v
    elseif key == "class" then
      -- class attribute contains "space-separated tokens", each of which
      -- we'd like quick access to
      for class in string.gmatch(v, "%S+") do
        table.insert(self.classes, class)
      end
    end
  end

  -- Add `node` to the Set stored under `name` in `list`, creating it on demand.
  local function insert(list, name, node)
    if not list[name] then
      list[name] = Set:new()
    end
    list[name]:add(node)
  end

  --- Mark the element as closed and register it with all of its ancestors.
  -- Registration happens only here, so an element that is never closed is
  -- not returned by select().
  -- @param closestart index of the first character of the closing tag (optional)
  -- @param closeend   index of the last character of the closing tag (optional)
  function ElementNode:close(closestart, closeend)
    if closestart and closeend then
      self._closestart, self._closeend = closestart, closeend
    end
    -- inform higher level nodes about this element's existence in their branches
    local node = self.parent
    while node do
      node.deepernodes:add(self)
      insert(node.deeperelements, self.name, self)
      for k in pairs(self.attributes) do
        insert(node.deeperattributes, k, self)
      end
      if self.id then
        insert(node.deeperids, self.id, self)
      end
      for _, v in ipairs(self.classes) do
        insert(node.deeperclasses, v, self)
      end
      node = node.parent
    end
  end

  --- Select descendants with a whitespace-separated selector.
  -- Each part may be `*`, `>` (next part matches direct children only) or a
  -- combination of `name`, `#id`, `.class` and `[attribute]`.
  -- @param s selector string
  -- @return list of matching nodes in unspecified order; {} when `s` is not a
  --         string or contains no selector part
  function ElementNode:select(s)
    if type(s) ~= "string" then return {} end
    local subjects = Set:new({self})
    local resultset
    local childrenonly = false
    for part in string.gmatch(s, "%S+") do
      if part == ">" then
        childrenonly = true
      else
        resultset = Set:new()
        for subject in pairs(subjects) do
          local init = subject.deepernodes
          if childrenonly then init = Set:new(subject.nodes) end
          resultset = resultset + init
        end
        -- reset only after every subject was handled, so that `>` applies to
        -- all of them and not just to the first one visited
        childrenonly = false
        if part ~= "*" then
          for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
            if t == "" then resultset = resultset * self.deeperelements[w]
            elseif t == "[" then resultset = resultset * self.deeperattributes[w]
            elseif t == "#" then resultset = resultset * self.deeperids[w]
            elseif t == "." then resultset = resultset * self.deeperclasses[w]
            end
          end
        end
        -- `*` narrows nothing but still moves the subjects one step deeper
        subjects = Set:new(resultset)
      end
    end
    if not resultset then return {} end
    return resultset:tolist()
  end

  return ElementNode
end

---------------------------------------------------------------------------
-- HtmlParser.lua
---------------------------------------------------------------------------
package.preload["HtmlParser"] = function()
  local ElementNode = require("ElementNode")
  local voidelements = require("voidelements")

  local HtmlParser = {}

  --- Parse HTML text into a tree of ElementNode objects.
  -- @param text HTML source
  -- @return the root ElementNode
  local function parse(text)
    local root = ElementNode:new(text)

    -- opentags maps a lowercased element name to a stack of its currently
    -- open elements, so nested elements of the same name close innermost first
    local node, descend, tpos, opentags = root, true, 1, {}
    while true do
      local openstart, name
      openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
      if not name then break end
      local tag = ElementNode:new(name, node, descend, openstart, tpos)
      node = tag

      -- attributes: key=value, value optionally wrapped in ' or "
      local tagst, apos = tag:gettext(), 1
      while true do
        local _, keyend, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
        if not k then break end
        local pattern = "=([^%s'\">]*)"
        if quote ~= '' then
          pattern = quote .. "([^" .. quote .. "]*)" .. quote
        end
        local valend, v
        _, valend, v = string.find(tagst, pattern, keyend)
        -- an unterminated quote yields no value; stop instead of restarting
        -- the search from the beginning of the tag forever
        if not v then break end
        tag:addattribute(k, v)
        apos = valend
      end

      local key = string.lower(tag.name)
      if voidelements[key] then
        -- void elements have no content: the next tag is a sibling
        descend = false
        tag:close()
      else
        -- the next tag (if any precedes the closing tag) is a child
        descend = true
        local stack = opentags[key]
        if not stack then
          stack = {}
          opentags[key] = stack
        end
        stack[#stack + 1] = tag
      end

      -- consume every closing tag that directly follows
      local closeend = tpos
      while true do
        local closestart, closing, closename
        closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
        closing = closing and closing ~= ''
        if not closing then break end
        local stack = opentags[string.lower(closename)]
        local closed = stack and table.remove(stack)
        -- a closing tag without a matching open element is ignored
        if closed then
          closestart = string.find(root._text, "<", closestart, true)
          closed:close(closestart, closeend + 1)
          node = closed.parent
          descend = true
        end
      end
    end

    return root
  end
  HtmlParser.parse = parse

  return HtmlParser
end

---------------------------------------------------------------------------
-- Set.lua
---------------------------------------------------------------------------
package.preload["Set"] = function()
  local Set = {}
  Set.mt = {__index = Set}

  --- Create a set.
  -- A Set, or a table with an empty array part, contributes its keys; a list
  -- contributes its values; any other non-nil value becomes the only member.
  function Set:new(t)
    local instance = {}
    if type(t) == "table" then
      if getmetatable(t) == Set.mt or #t == 0 then
        for k in pairs(t) do
          instance[k] = true
        end
      else
        for _, v in ipairs(t) do
          instance[v] = true
        end
      end
    elseif t ~= nil then
      instance[t] = true
    end
    return setmetatable(instance, Set.mt)
  end

  function Set:add(e)
    self[e] = true
  end

  function Set:remove(e)
    self[e] = nil
  end

  -- Operands of the set operators may be plain values; wrap them on demand.
  local function toset(x)
    if getmetatable(x) == Set.mt then return x end
    return Set:new(x)
  end

  -- Union
  Set.mt.__add = function (a, b)
    local res = Set:new()
    for k in pairs(toset(a)) do res[k] = true end
    for k in pairs(toset(b)) do res[k] = true end
    return res
  end

  -- Subtraction
  Set.mt.__sub = function (a, b)
    local res = Set:new()
    for k in pairs(toset(a)) do res[k] = true end
    for k in pairs(toset(b)) do res[k] = nil end
    return res
  end

  -- Intersection
  Set.mt.__mul = function (a, b)
    local res = Set:new()
    a, b = toset(a), toset(b)
    for k in pairs(a) do
      -- rawget: a member named like a Set method must not match via __index
      if rawget(b, k) then res[k] = true end
    end
    return res
  end

  -- String representation
  Set.mt.__tostring = function (set)
    local parts = {}
    for k in pairs(set) do
      -- members may be tables, which cannot be concatenated directly
      parts[#parts + 1] = tostring(k)
    end
    return "{" .. table.concat(parts, ", ") .. "}"
  end

  --- Number of members.
  function Set:len()
    local num = 0
    for _ in pairs(self) do
      num = num + 1
    end
    return num
  end

  --- Members as a list, in unspecified order.
  function Set:tolist()
    local res = {}
    for k in pairs(self) do
      table.insert(res, k)
    end
    return res
  end

  return Set
end

-- test.html (fixture added by the same change set)
+ + + + + +
+ Surely, we could not have done this huge amount of work all by ourselves.
+ Therefore, we cannot thank enough the following persons for their kind contributions:
+
+