.lua-files/htmlparser/ElementNode.lua

284 lines
7.9 KiB
Lua

-- vim: ft=lua ts=2
local Set = {}
Set.mt = {__index = Set}
function Set:new(values)
local instance = {}
local isSet if getmetatable(values) == Set.mt then isSet = true end
if type(values) == "table" then
if not isSet and #values > 0 then
for _,v in ipairs(values) do
instance[v] = true
end
else
for k in pairs(values) do
instance[k] = true
end
end
elseif values ~= nil then
instance = {[values] = true}
end
return setmetatable(instance, Set.mt)
end
function Set:add(e)
if e ~= nil then self[e] = true end
return self
end
function Set:remove(e)
if e ~= nil then self[e] = nil end
return self
end
function Set:tolist()
local res = {}
for k in pairs(self) do
table.insert(res, k)
end
return res
end
Set.mt.__add = function (a, b)
local res, a, b = Set:new(), Set:new(a), Set:new(b)
for k in pairs(a) do res[k] = true end
for k in pairs(b) do res[k] = true end
return res
end
-- Subtraction
Set.mt.__sub = function (a, b)
local res, a, b = Set:new(), Set:new(a), Set:new(b)
for k in pairs(a) do res[k] = true end
for k in pairs(b) do res[k] = nil end
return res
end
-- Intersection
Set.mt.__mul = function (a, b)
local res, a, b = Set:new(), Set:new(a), Set:new(b)
for k in pairs(a) do
res[k] = b[k]
end
return res
end
-- String representation
Set.mt.__tostring = function (set)
local s = "{"
local sep = ""
for k in pairs(set) do
s = s .. sep .. tostring(k)
sep = ", "
end
return s .. "}"
end
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}
function ElementNode:new(index, nameortext, node, descend, openstart, openend)
local instance = {
index = index,
name = nameortext,
level = 0,
parent = nil,
root = nil,
nodes = {},
_openstart = openstart, _openend = openend,
_closestart = openstart, _closeend = openend,
attributes = {},
id = nil,
classes = {},
deepernodes = Set:new(),
deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
}
if not node then
instance.name = "root"
instance.root = instance
instance._text = nameortext
local length = string.len(nameortext)
instance._openstart, instance._openend = 1, length
instance._closestart, instance._closeend = 1, length
elseif descend then
instance.root = node.root
instance.parent = node
instance.level = node.level + 1
table.insert(node.nodes, instance)
else
instance.root = node.root
instance.parent = node.parent or node --XXX: adds some safety but needs more testing for heisenbugs in corner cases
instance.level = node.level
table.insert((node.parent and node.parent.nodes or node.nodes), instance) --XXX: see above about heisenbugs
end
return setmetatable(instance, ElementNode.mt)
end
function ElementNode:gettext()
return string.sub(self.root._text, self._openstart, self._closeend)
end
function ElementNode:settext(c)
self.root._text=c
end
function ElementNode:textonly()
return (self:gettext():gsub("<[^>]*>",""))
end
function ElementNode:getcontent()
return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
end
function ElementNode:addattribute(k, v)
self.attributes[k] = v
if string.lower(k) == "id" then
self.id = v
-- class attribute contains "space-separated tokens", each of which we'd like quick access to
elseif string.lower(k) == "class" then
for class in string.gmatch(v, "%S+") do
table.insert(self.classes, class)
end
end
end
local function insert(table, name, node)
table[name] = table[name] or Set:new()
table[name]:add(node)
end
function ElementNode:close(closestart, closeend)
if closestart and closeend then
self._closestart, self._closeend = closestart, closeend
end
-- inform hihger level nodes about this element's existence in their branches
local node = self
while true do
node = node.parent
if not node then break end
node.deepernodes:add(self)
insert(node.deeperelements, self.name, self)
for k in pairs(self.attributes) do
insert(node.deeperattributes, k, self)
end
if self.id then
insert(node.deeperids, self.id, self)
end
for _,v in ipairs(self.classes) do
insert(node.deeperclasses, v, self)
end
end
end
local function escape(s)
-- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix
return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")
end
local function select(self, s)
if not s or type(s) ~= "string" or s == "" then return Set:new() end
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
["#"] = self.deeperids, ["."] = self.deeperclasses}
local function match(t, w)
local m, e, v
if t == "[" then w, m, e, v = string.match(w,
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
"(=?)" .. -- e = the optional "="
"(.*)" -- v = anything following the "=", or else ""
)
end
local matched = Set:new(sets[t][w])
-- attribute value selectors
if e == "=" then
if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
v = string.sub(v, 2, #v - 1) -- strip quotes
if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
for node in pairs(matched) do
local a = node.attributes[w]
-- equals
if m == "" and a ~= v then matched:remove(node)
-- not equals
elseif m == "!" and a == v then matched:remove(node)
-- prefix
elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
-- contains
elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node)
-- word
elseif m =="~" then matched:remove(node)
for word in string.gmatch(a, "%S+") do
if word == v then matched:add(node) break end
end
-- starts with
elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
-- ends with
elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
end
end -- for node
end -- if v
return matched
end
local subjects, resultset, childrenonly = Set:new({self})
for part in string.gmatch(s, "%S+") do
repeat
if part == ">" then childrenonly = true --[[goto nextpart]] break end
resultset = Set:new()
for subject in pairs(subjects) do
local star = subject.deepernodes
if childrenonly then star = Set:new(subject.nodes) end
resultset = resultset + star
end
childrenonly = false
if part == "*" then --[[goto nextpart]] break end
local excludes, filter = Set:new()
local start, pos = 0, 0
while true do
local switch, stype, name, eq, quote
start, pos, switch, stype, name, eq, quote = string.find(part,
"(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
"([:%[#.]?)" .. -- stype = a possible :, [, #, or .
"([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore)
"([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
"(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
pos + 1
)
if not name then break end
repeat
if ":" == stype then
filter = name
--[[goto nextname]] break
end
if ")" == switch then
filter = nil
end
if "[" == stype and "" ~= quote then
local value
start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
name = name .. eq .. value
end
local matched = match(stype, name)
if filter == "not" then
excludes = excludes + matched
else
resultset = resultset * matched
end
--::nextname::
break
until true
end
resultset = resultset - excludes
subjects = Set:new(resultset)
--::nextpart::
break
until true
end
resultset = resultset:tolist()
table.sort(resultset, function (a, b) return a.index < b.index end)
return resultset
end
function ElementNode:select(s) return select(self, s) end
ElementNode.mt.__call = select
return ElementNode