diff --git a/src/htmlparser.lua b/src/htmlparser.lua index 1328887..a96ac58 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,67 +1,68 @@ +-- vim: ft=lua ts=2 local ElementNode = require("htmlparser.ElementNode") local voidelements = require("htmlparser.voidelements") local HtmlParser = {} local function parse(text) - local index = 0 - local root = ElementNode:new(index, text) + local index = 0 + local root = ElementNode:new(index, text) - local node, descend, tpos, opentags = root, true, 1, {} - while true do - local openstart, name - openstart, tpos, name = string.find(root._text, - "<" .. -- an uncaptured starting "<" - "([%w-]+)" .. -- name = the first word, directly following the "<" - "[^>]*>", -- include, but not capture everything up to the next ">" - tpos) - if not name then break end - index = index + 1 - local tag = ElementNode:new(index, name, node, descend, openstart, tpos) - node = tag + local node, descend, tpos, opentags = root, true, 1, {} + while true do + local openstart, name + openstart, tpos, name = string.find(root._text, + "<" .. -- an uncaptured starting "<" + "([%w-]+)" .. -- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) + if not name then break end + index = index + 1 + local tag = ElementNode:new(index, name, node, descend, openstart, tpos) + node = tag - local tagst, apos = tag:gettext(), 1 - while true do - local start, k, eq, quote, v - start, apos, k, eq, quote = string.find(tagst, - "%s+" .. -- some uncaptured space - "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" - "(=?)" .. -- eq = the optional; "=", else "" - "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" - apos) - if not k or k == "/>" or k == ">" then break end - if eq == "=" then - local pattern = "=([^%s>]*)" - if quote ~= "" then - pattern = quote .. "([^" .. quote .. "]*)" .. quote - end - start, apos, v = string.find(tagst, pattern, apos) - end - tag:addattribute(k, v or "") - end + local tagst, apos = tag:gettext(), 1 + while true do + local start, k, eq, quote, v + start, apos, k, eq, quote = string.find(tagst, + "%s+" .. -- some uncaptured space + "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" + "(=?)" .. -- eq = the optional; "=", else "" + "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" + apos) + if not k or k == "/>" or k == ">" then break end + if eq == "=" then + local pattern = "=([^%s>]*)" + if quote ~= "" then + pattern = quote .. "([^" .. quote .. "]*)" .. quote + end + start, apos, v = string.find(tagst, pattern, apos) + end + tag:addattribute(k, v or "") + end - if voidelements[string.lower(tag.name)] then - descend = false - tag:close() - else - opentags[tag.name] = opentags[tag.name] or {} - table.insert(opentags[tag.name], tag) - end + if voidelements[string.lower(tag.name)] then + descend = false + tag:close() + else + opentags[tag.name] = opentags[tag.name] or {} + table.insert(opentags[tag.name], tag) + end - local closeend = tpos - while true do - local closestart, closing, closename - closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend) - if not closing or closing == "" then break end - tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags - closestart = string.find(root._text, "<", closestart) - tag:close(closestart, closeend + 1) - node = tag.parent - descend = true - end - end + local closeend = tpos + while true do + local closestart, closing, closename + closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend) + if not closing or closing == "" then break end + tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags + closestart = string.find(root._text, "<", closestart) + tag:close(closestart, closeend + 1) + node = tag.parent + descend = true + end + end - return root + return root end HtmlParser.parse = parse diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index de21937..7c8d571 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -1,272 +1,273 @@ +-- vim: ft=lua ts=2 pcall(require, "luarocks.loader") local Set = {} Set.mt = {__index = Set} function Set:new(values) - local instance = {} - local isSet if getmetatable(values) == Set.mt then isSet = true end - if type(values) == "table" then - if not isSet and #values > 0 then - for _,v in ipairs(values) do - instance[v] = true - end - else - for k in pairs(values) do - instance[k] = true - end - end - elseif values ~= nil then - instance = {[values] = true} - end - return setmetatable(instance, Set.mt) + local instance = {} + local isSet if getmetatable(values) == Set.mt then isSet = true end + if type(values) == "table" then + if not isSet and #values > 0 then + for _,v in ipairs(values) do + instance[v] = true + end + else + for k in pairs(values) do + instance[k] = true + end + end + elseif values ~= nil then + instance = {[values] = true} + end + return setmetatable(instance, Set.mt) end function Set:add(e) - if e ~= nil then self[e] = true end - return self + if e ~= nil then self[e] = true end + return self end function Set:remove(e) - if e ~= nil then self[e] = nil end - return self + if e ~= nil then self[e] = nil end + return self end function Set:tolist() - local res = {} - for k in pairs(self) do - table.insert(res, k) - end - return res + local res = {} + for k in pairs(self) do + table.insert(res, k) + end + return res end Set.mt.__add = function (a, b) - local res, a, b = Set:new(), Set:new(a), Set:new(b) - for k in pairs(a) do res[k] = true end - for k in pairs(b) do res[k] = true end - return res + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = true end + return res end -- Subtraction Set.mt.__sub = function (a, b) - local res, a, b = Set:new(), Set:new(a), Set:new(b) - for k in pairs(a) do res[k] = true end - for k in pairs(b) do res[k] = nil end - return res + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = nil end + return res end -- Intersection Set.mt.__mul = function (a, b) - local res, a, b = Set:new(), Set:new(a), Set:new(b) - for k in pairs(a) do - res[k] = b[k] - end - return res + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do + res[k] = b[k] + end + return res end -- String representation Set.mt.__tostring = function (set) - local s = "{" - local sep = "" - for k in pairs(set) do - s = s .. sep .. tostring(k) - sep = ", " - end - return s .. "}" + local s = "{" + local sep = "" + for k in pairs(set) do + s = s .. sep .. tostring(k) + sep = ", " + end + return s .. "}" end local ElementNode = {} ElementNode.mt = {__index = ElementNode} function ElementNode:new(index, nameortext, node, descend, openstart, openend) - local instance = { - index = index, - name = nameortext, - level = 0, - parent = nil, - root = nil, - nodes = {}, - _openstart = openstart, _openend = openend, - _closestart = openstart, _closeend = openend, - attributes = {}, - id = nil, - classes = {}, - deepernodes = Set:new(), - deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} - } - if not node then - instance.name = "root" - instance.root = instance - instance._text = nameortext - local length = string.len(nameortext) - instance._openstart, instance._openend = 1, length - instance._closestart, instance._closeend = 1, length - elseif descend then - instance.root = node.root - instance.parent = node - instance.level = node.level + 1 - table.insert(node.nodes, instance) - else - instance.root = node.root - instance.parent = node.parent - instance.level = node.level - table.insert(node.parent.nodes, instance) - end - return setmetatable(instance, ElementNode.mt) + local instance = { + index = index, + name = nameortext, + level = 0, + parent = nil, + root = nil, + nodes = {}, + _openstart = openstart, _openend = openend, + _closestart = openstart, _closeend = openend, + attributes = {}, + id = nil, + classes = {}, + deepernodes = Set:new(), + deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} + } + if not node then + instance.name = "root" + instance.root = instance + instance._text = nameortext + local length = string.len(nameortext) + instance._openstart, instance._openend = 1, length + instance._closestart, instance._closeend = 1, length + elseif descend then + instance.root = node.root + instance.parent = node + instance.level = node.level + 1 + table.insert(node.nodes, instance) + else + instance.root = node.root + instance.parent = node.parent + instance.level = node.level + table.insert(node.parent.nodes, instance) + end + return setmetatable(instance, ElementNode.mt) end function ElementNode:gettext() - return string.sub(self.root._text, self._openstart, self._closeend) + return string.sub(self.root._text, self._openstart, self._closeend) end function ElementNode:getcontent() - return string.sub(self.root._text, self._openend + 1, self._closestart - 1) + return string.sub(self.root._text, self._openend + 1, self._closestart - 1) end function ElementNode:addattribute(k, v) - self.attributes[k] = v - if string.lower(k) == "id" then - self.id = v - -- class attribute contains "space-separated tokens", each of which we'd like quick access to - elseif string.lower(k) == "class" then - for class in string.gmatch(v, "%S+") do - table.insert(self.classes, class) - end - end + self.attributes[k] = v + if string.lower(k) == "id" then + self.id = v + -- class attribute contains "space-separated tokens", each of which we'd like quick access to + elseif string.lower(k) == "class" then + for class in string.gmatch(v, "%S+") do + table.insert(self.classes, class) + end + end end local function insert(table, name, node) - table[name] = table[name] or Set:new() - table[name]:add(node) + table[name] = table[name] or Set:new() + table[name]:add(node) end function ElementNode:close(closestart, closeend) - if closestart and closeend then - self._closestart, self._closeend = closestart, closeend - end - -- inform hihger level nodes about this element's existence in their branches - local node = self - while true do - node = node.parent - if not node then break end - node.deepernodes:add(self) - insert(node.deeperelements, self.name, self) - for k in pairs(self.attributes) do - insert(node.deeperattributes, k, self) - end - if self.id then - insert(node.deeperids, self.id, self) - end - for _,v in ipairs(self.classes) do - insert(node.deeperclasses, v, self) - end - end + if closestart and closeend then + self._closestart, self._closeend = closestart, closeend + end + -- inform hihger level nodes about this element's existence in their branches + local node = self + while true do + node = node.parent + if not node then break end + node.deepernodes:add(self) + insert(node.deeperelements, self.name, self) + for k in pairs(self.attributes) do + insert(node.deeperattributes, k, self) + end + if self.id then + insert(node.deeperids, self.id, self) + end + for _,v in ipairs(self.classes) do + insert(node.deeperclasses, v, self) + end + end end local function escape(s) - -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix - return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") + -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix + return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end local function select(self, s) - if not s or type(s) ~= "string" or s == "" then return Set:new() end - local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes, - ["#"] = self.deeperids, ["."] = self.deeperclasses} - local function match(t, w) - local m, e, v - if t == "[" then w, m, e, v = string.match(w, - "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^" - "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "=" - "(=?)" .. -- e = the optional "=" - "(.*)" -- v = anything following the "=", or else "" - ) - end - local matched = Set:new(sets[t][w]) - -- attribute value selectors - if e == "=" then - if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted - v = string.sub(v, 2, #v - 1) -- strip quotes - if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute - for node in pairs(matched) do - local a = node.attributes[w] - -- equals - if m == "" and a ~= v then matched:remove(node) - -- not equals - elseif m == "!" and a == v then matched:remove(node) - -- prefix - elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node) - -- contains - elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node) - -- word - elseif m =="~" then matched:remove(node) - for word in string.gmatch(a, "%S+") do - if word == v then matched:add(node) break end - end - -- starts with - elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node) - -- ends with - elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node) - end - end -- for node - end -- if v - return matched - end + if not s or type(s) ~= "string" or s == "" then return Set:new() end + local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes, + ["#"] = self.deeperids, ["."] = self.deeperclasses} + local function match(t, w) + local m, e, v + if t == "[" then w, m, e, v = string.match(w, + "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^" + "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "=" + "(=?)" .. -- e = the optional "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = Set:new(sets[t][w]) + -- attribute value selectors + if e == "=" then + if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted + v = string.sub(v, 2, #v - 1) -- strip quotes + if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute + for node in pairs(matched) do + local a = node.attributes[w] + -- equals + if m == "" and a ~= v then matched:remove(node) + -- not equals + elseif m == "!" and a == v then matched:remove(node) + -- prefix + elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node) + -- contains + elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node) + -- word + elseif m =="~" then matched:remove(node) + for word in string.gmatch(a, "%S+") do + if word == v then matched:add(node) break end + end + -- starts with + elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node) + -- ends with + elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node) + end + end -- for node + end -- if v + return matched + end - local subjects, resultset, childrenonly = Set:new({self}) - for part in string.gmatch(s, "%S+") do + local subjects, resultset, childrenonly = Set:new({self}) + for part in string.gmatch(s, "%S+") do repeat - if part == ">" then childrenonly = true --[[goto nextpart]] break end - resultset = Set:new() - for subject in pairs(subjects) do - local star = subject.deepernodes - if childrenonly then star = Set:new(subject.nodes) end - resultset = resultset + star - end - childrenonly = false - if part == "*" then --[[goto nextpart]] break end - local excludes, filter = Set:new() - local start, pos = 0, 0 - while true do - local switch, stype, name, eq, quote - start, pos, switch, stype, name, eq, quote = string.find(part, - "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off - "([:%[#.]?)" .. -- stype = a possible :, [, #, or . - "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore) - "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = - "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value - pos + 1 - ) - if not name then break end + if part == ">" then childrenonly = true --[[goto nextpart]] break end + resultset = Set:new() + for subject in pairs(subjects) do + local star = subject.deepernodes + if childrenonly then star = Set:new(subject.nodes) end + resultset = resultset + star + end + childrenonly = false + if part == "*" then --[[goto nextpart]] break end + local excludes, filter = Set:new() + local start, pos = 0, 0 + while true do + local switch, stype, name, eq, quote + start, pos, switch, stype, name, eq, quote = string.find(part, + "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off + "([:%[#.]?)" .. -- stype = a possible :, [, #, or . + "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore) + "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = + "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value + pos + 1 + ) + if not name then break end repeat - if ":" == stype then - filter = name - --[[goto nextname]] break - end - if ")" == switch then - filter = nil - end - if "[" == stype and "" ~= quote then - local value - start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) - name = name .. eq .. value - end - local matched = match(stype, name) - if filter == "not" then - excludes = excludes + matched - else - resultset = resultset * matched - end - --::nextname:: + if ":" == stype then + filter = name + --[[goto nextname]] break + end + if ")" == switch then + filter = nil + end + if "[" == stype and "" ~= quote then + local value + start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) + name = name .. eq .. value + end + local matched = match(stype, name) + if filter == "not" then + excludes = excludes + matched + else + resultset = resultset * matched + end + --::nextname:: break until true - end - resultset = resultset - excludes - subjects = Set:new(resultset) - --::nextpart:: + end + resultset = resultset - excludes + subjects = Set:new(resultset) + --::nextpart:: break until true - end - resultset = resultset:tolist() - table.sort(resultset, function (a, b) return a.index < b.index end) - return resultset + end + resultset = resultset:tolist() + table.sort(resultset, function (a, b) return a.index < b.index end) + return resultset end function ElementNode:select(s) return select(self, s) end diff --git a/src/htmlparser/voidelements.lua b/src/htmlparser/voidelements.lua index b95b010..43dedf5 100644 --- a/src/htmlparser/voidelements.lua +++ b/src/htmlparser/voidelements.lua @@ -1,18 +1,19 @@ +-- vim: ft=lua ts=2 return { - area = true, - base = true, - br = true, - col = true, - command = true, - embed = true, - hr = true, - img = true, - input = true, - keygen = true, - link = true, - meta = true, - param = true, - source = true, - track = true, - wbr = true + area = true, + base = true, + br = true, + col = true, + command = true, + embed = true, + hr = true, + img = true, + input = true, + keygen = true, + link = true, + meta = true, + param = true, + source = true, + track = true, + wbr = true }