source reformatting (tabs)

This commit is contained in:
Vadim A. Misbakh-Soloviov 2017-04-09 01:50:14 +07:00
parent 2c5b1d1689
commit 4dbae96e8d
No known key found for this signature in database
GPG Key ID: 26503D349B3B334B
3 changed files with 279 additions and 276 deletions

View File

@ -1,67 +1,68 @@
-- vim: ft=lua ts=2
local ElementNode = require("htmlparser.ElementNode") local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements") local voidelements = require("htmlparser.voidelements")
local HtmlParser = {}

--- Parse an HTML string into a tree of ElementNode objects.
-- The text is scanned for opening tags; every opening tag becomes an
-- ElementNode whose attributes are read from the tag text. Void elements
-- are closed immediately, other elements are remembered per tag name in
-- `opentags` until a matching closing tag is seen.
-- @param text the HTML source string
-- @return the root ElementNode (created by ElementNode:new(0, text))
local function parse(text)
	local index = 0
	local root = ElementNode:new(index, text)
	local node, descend, tpos, opentags = root, true, 1, {}
	while true do
		local openstart, name
		openstart, tpos, name = string.find(root._text,
			"<" .. -- an uncaptured starting "<"
			"([%w-]+)" .. -- name = the first word, directly following the "<"
			"[^>]*>", -- include, but not capture everything up to the next ">"
			tpos)
		if not name then break end
		index = index + 1
		local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
		node = tag
		local tagst, apos = tag:gettext(), 1
		while true do
			local start, k, eq, quote, v
			start, apos, k, eq, quote = string.find(tagst,
				"%s+" .. -- some uncaptured space
				"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
				"(=?)" .. -- eq = the optional; "=", else ""
				"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
				apos)
			-- k can never contain "/" or ">", so a missing match is the only stop condition
			if not k then break end
			if eq == "=" then
				local pattern = "=([^%s>]*)"
				if quote ~= "" then
					pattern = quote .. "([^" .. quote .. "]*)" .. quote
				end
				local _, valueend, value = string.find(tagst, pattern, apos)
				-- Only advance when the value was actually found. An unterminated
				-- quote (e.g. a ">" inside a quoted value cut the tag short) used to
				-- set apos to nil, restarting the scan at 1 and looping forever.
				if valueend then
					apos, v = valueend, value
				end
			end
			tag:addattribute(k, v or "")
		end
		if voidelements[string.lower(tag.name)] then
			-- void elements have no content: the next tag is a sibling
			descend = false
			tag:close()
		else
			-- a regular element was opened: the next tag is its child
			descend = true
			opentags[tag.name] = opentags[tag.name] or {}
			table.insert(opentags[tag.name], tag)
		end
		local closeend = tpos
		while true do
			local closestart, closing, closename
			closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
			if not closing or closing == "" then break end
			tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
			-- the match above starts before the "<"; locate the "<" itself
			closestart = string.find(root._text, "<", closestart)
			tag:close(closestart, closeend + 1)
			node = tag.parent
			descend = true
		end
	end
	return root
end
HtmlParser.parse = parse

View File

@ -1,272 +1,273 @@
-- vim: ft=lua ts=2
pcall(require, "luarocks.loader") pcall(require, "luarocks.loader")
--- Minimal set type: every member is stored as a key mapped to `true`.
-- Methods are found through the metatable's __index, so a member whose key
-- equals a method name (e.g. the string "add") shadows that method on the
-- instance.
local Set = {}
Set.mt = {__index = Set}

--- Create a new set.
-- @param values nil (empty set), another Set (copied), a non-empty
--   array-like table (its values become members), any other table (its keys
--   become members), or a single non-nil value (the only member)
-- @return the new Set
function Set:new(values)
	local instance = {}
	if type(values) == "table" then
		local isSet = getmetatable(values) == Set.mt
		if not isSet and #values > 0 then
			for _, v in ipairs(values) do
				instance[v] = true
			end
		else
			for k in pairs(values) do
				instance[k] = true
			end
		end
	elseif values ~= nil then
		instance[values] = true
	end
	return setmetatable(instance, Set.mt)
end

--- Add a member (nil is ignored).
-- @return self, to allow chaining
function Set:add(e)
	if e ~= nil then self[e] = true end
	return self
end

--- Remove a member (nil is ignored).
-- @return self, to allow chaining
function Set:remove(e)
	if e ~= nil then self[e] = nil end
	return self
end

--- Return the members as an array, in unspecified order.
function Set:tolist()
	local res = {}
	for k in pairs(self) do
		res[#res + 1] = k
	end
	return res
end

-- Union
Set.mt.__add = function (a, b)
	-- Set:new normalizes either operand (Set, list, or single value) into a fresh Set
	local res = Set:new(a)
	for k in pairs(Set:new(b)) do res[k] = true end
	return res
end

-- Subtraction
Set.mt.__sub = function (a, b)
	local res = Set:new(a)
	for k in pairs(Set:new(b)) do res[k] = nil end
	return res
end

-- Intersection
Set.mt.__mul = function (a, b)
	local res, lhs, rhs = Set:new(), Set:new(a), Set:new(b)
	for k in pairs(lhs) do
		-- rawget: a plain rhs[k] would fall through __index and find Set's own
		-- methods for keys like "add" or "new", wrongly keeping them
		if rawget(rhs, k) then res[k] = true end
	end
	return res
end

-- String representation
Set.mt.__tostring = function (set)
	local parts = {}
	for k in pairs(set) do
		parts[#parts + 1] = tostring(k)
	end
	return "{" .. table.concat(parts, ", ") .. "}"
end
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}

--- Create an element node, or the root node when `node` is nil.
-- @param index sequence number of the node
-- @param nameortext the tag name, or the full source text for the root
-- @param node the most recently created node (nil when creating the root)
-- @param descend true to attach the new node as a child of `node`,
--   false to attach it as a sibling (a child of node.parent)
-- @param openstart start position of the opening tag in the source text
-- @param openend end position of the opening tag in the source text
-- @return the new ElementNode
function ElementNode:new(index, nameortext, node, descend, openstart, openend)
	local element = setmetatable({
		index = index,
		name = nameortext,
		level = 0,
		nodes = {},
		-- until close() is given positions, the element spans just its opening tag
		_openstart = openstart, _openend = openend,
		_closestart = openstart, _closeend = openend,
		attributes = {},
		classes = {},
		deepernodes = Set:new(),
		deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
	}, ElementNode.mt)

	if not node then
		-- the root owns the source text and spans all of it
		local length = string.len(nameortext)
		element.name = "root"
		element.root = element
		element._text = nameortext
		element._openstart, element._openend = 1, length
		element._closestart, element._closeend = 1, length
		return element
	end

	local parent, level = node, node.level + 1
	if not descend then
		parent, level = node.parent, node.level
	end
	element.root = node.root
	element.parent = parent
	element.level = level
	table.insert(parent.nodes, element)
	return element
end
--- Return the source text of this element, from the start of its opening
-- tag through its recorded closing position.
function ElementNode:gettext()
	local source = self.root._text
	return source:sub(self._openstart, self._closeend)
end
--- Return the source text between the end of the opening tag and the
-- start of the closing tag.
function ElementNode:getcontent()
	local source = self.root._text
	local first, last = self._openend + 1, self._closestart - 1
	return source:sub(first, last)
end
--- Record an attribute on this element.
-- The attribute is stored under its original key; "id" and "class"
-- (compared case-insensitively) additionally update self.id / self.classes.
-- @param k attribute name
-- @param v attribute value (string)
function ElementNode:addattribute(k, v)
	self.attributes[k] = v
	local key = string.lower(k)
	if key == "id" then
		self.id = v
	elseif key == "class" then
		-- class attribute contains "space-separated tokens", each of which we'd like quick access to
		local classes = self.classes
		for class in string.gmatch(v, "%S+") do
			classes[#classes + 1] = class
		end
	end
end
--- Add `node` to the Set stored under `name` in `index`, creating the Set
-- on first use.
local function insert(index, name, node)
	local bucket = index[name]
	if not bucket then
		bucket = Set:new()
		index[name] = bucket
	end
	bucket:add(node)
end
--- Close this element and register it with all of its ancestors.
-- @param closestart start position of the closing tag (optional)
-- @param closeend end position of the closing tag (optional)
-- When either position is missing, the positions set at creation are kept.
function ElementNode:close(closestart, closeend)
	if closestart and closeend then
		self._closestart, self._closeend = closestart, closeend
	end
	-- inform higher level nodes about this element's existence in their branches
	local ancestor = self.parent
	while ancestor do
		ancestor.deepernodes:add(self)
		insert(ancestor.deeperelements, self.name, self)
		for attribute in pairs(self.attributes) do
			insert(ancestor.deeperattributes, attribute, self)
		end
		if self.id then
			insert(ancestor.deeperids, self.id, self)
		end
		for _, class in ipairs(self.classes) do
			insert(ancestor.deeperclasses, class, self)
		end
		ancestor = ancestor.parent
	end
end
--- Escape Lua pattern magic characters so `s` can be used as a literal pattern.
-- @param s the string to escape
-- @return exactly one value: `s` with every ^, $, (, ), %, ., [, ], *, +, -
--   and ? prefixed by "%"
local function escape(s)
	-- The outer parentheses discard gsub's second result (the substitution
	-- count). Without them, string.match(a, escape(v)) received that count as
	-- its `init` argument and started matching at the wrong position.
	return (string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1"))
end
--- Select descendants of `self` matching a CSS-like selector string.
-- The selector is split on whitespace into parts. Each part narrows the
-- descendants of the previous part's matches (or their direct children when
-- preceded by a ">" part). A part may combine an element name, "#id",
-- ".class", "[attr]" and quoted attribute tests ("=", "!=", "|=", "*=",
-- "~=", "^=", "$="); ":not(...)" excludes what is matched inside it.
-- @param self the ElementNode to search below
-- @param s the selector string
-- @return an empty Set when `s` is nil, not a string, or ""; otherwise an
--   array of matching nodes sorted by their `index`
local function select(self, s)
	if not s or type(s) ~= "string" or s == "" then return Set:new() end
	local sets = {
		[""] = self.deeperelements, ["["] = self.deeperattributes,
		["#"] = self.deeperids, ["."] = self.deeperclasses
	}
	-- Return the Set of nodes below `self` matching one simple selector:
	-- t is the selector type ("", "[", "#" or "."), w its name (for "[",
	-- optionally followed by an operator and a quoted value).
	local function match(t, w)
		local m, e, v
		if t == "[" then
			w, m, e, v = string.match(w,
				"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
				"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
				"(=?)" .. -- e = the optional "="
				"(.*)" -- v = anything following the "=", or else ""
			)
		end
		local matched = Set:new(sets[t][w])
		-- attribute value selectors
		if e == "=" then
			if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
			v = string.sub(v, 2, #v - 1) -- strip quotes
			if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
			-- only existing keys are cleared during this traversal, which pairs permits
			for node in pairs(matched) do
				local a = node.attributes[w]
				-- equals
				if m == "" and a ~= v then matched:remove(node)
				-- not equals
				elseif m == "!" and a == v then matched:remove(node)
				-- prefix
				elseif m == "|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
				-- contains (plain search: no pattern escaping involved)
				elseif m == "*" and not string.find(a, v, 1, true) then matched:remove(node)
				-- word
				elseif m == "~" then
					local found = false
					for word in string.gmatch(a, "%S+") do
						if word == v then found = true break end
					end
					-- decide first, then remove: never re-add a key while traversing
					if not found then matched:remove(node) end
				-- starts with
				elseif m == "^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
				-- ends with
				elseif m == "$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
				end
			end -- for node
		end -- if e
		return matched
	end

	local subjects, resultset, childrenonly = Set:new({self})
	for part in string.gmatch(s, "%S+") do
		-- repeat ... until true lets "break" act as "continue with the next part"
		repeat
			if part == ">" then childrenonly = true --[[goto nextpart]] break end
			resultset = Set:new()
			for subject in pairs(subjects) do
				local star = subject.deepernodes
				if childrenonly then star = Set:new(subject.nodes) end
				resultset = resultset + star
			end
			childrenonly = false
			if part == "*" then --[[goto nextpart]] break end
			local excludes, filter = Set:new()
			local start, pos = 0, 0
			while true do
				local switch, stype, name, eq, quote
				start, pos, switch, stype, name, eq, quote = string.find(part,
					"(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
					"([:%[#.]?)" .. -- stype = a possible :, [, #, or .
					"([%w-_\\]+)" .. -- name = 1 or more alphanumeric chars (+ hyphen, backslash and underscore)
					"([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
					"(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
					pos + 1
				)
				if not name then break end
				repeat
					if ":" == stype then
						filter = name
						--[[goto nextname]] break
					end
					if ")" == switch then
						filter = nil
					end
					if "[" == stype and "" ~= quote then
						local value
						start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
						if not value then
							-- e.g. whitespace inside the quoted value split the selector part
							error("malformed attribute selector: " .. part)
						end
						name = name .. eq .. value
					end
					local matched = match(stype, name)
					if filter == "not" then
						excludes = excludes + matched
					else
						resultset = resultset * matched
					end
					--::nextname::
					break
				until true
			end
			resultset = resultset - excludes
			subjects = Set:new(resultset)
			--::nextpart::
			break
		until true
	end
	-- a selector of only whitespace and/or ">" never builds a result set
	if not resultset then return {} end
	resultset = resultset:tolist()
	table.sort(resultset, function (a, b) return a.index < b.index end)
	return resultset
end
--- Method form of the file-local `select`: search below this node.
-- @param s the selector string
-- @return whatever the local `select(self, s)` returns
function ElementNode:select(s)
	return select(self, s)
end

View File

@ -1,18 +1,19 @@
-- vim: ft=lua ts=2
-- Names of the elements treated as void; returned as a lookup table
-- mapping each name to true.
local names = {
	"area", "base", "br", "col", "command", "embed", "hr", "img",
	"input", "keygen", "link", "meta", "param", "source", "track", "wbr"
}
local voidelements = {}
for i = 1, #names do
	voidelements[names[i]] = true
end
return voidelements