diff --git a/src/htmlparser.lua b/src/htmlparser.lua
index 1328887..a96ac58 100644
--- a/src/htmlparser.lua
+++ b/src/htmlparser.lua
@@ -1,67 +1,68 @@
+-- vim: ft=lua ts=2
local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements")
local HtmlParser = {}
local function parse(text)
- local index = 0
- local root = ElementNode:new(index, text)
+ local index = 0
+ local root = ElementNode:new(index, text)
- local node, descend, tpos, opentags = root, true, 1, {}
- while true do
- local openstart, name
- openstart, tpos, name = string.find(root._text,
- "<" .. -- an uncaptured starting "<"
- "([%w-]+)" .. -- name = the first word, directly following the "<"
- "[^>]*>", -- include, but not capture everything up to the next ">"
- tpos)
- if not name then break end
- index = index + 1
- local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
- node = tag
+ local node, descend, tpos, opentags = root, true, 1, {}
+ while true do
+ local openstart, name
+ openstart, tpos, name = string.find(root._text,
+ "<" .. -- an uncaptured starting "<"
+ "([%w-]+)" .. -- name = the first word, directly following the "<"
+ "[^>]*>", -- include, but not capture everything up to the next ">"
+ tpos)
+ if not name then break end
+ index = index + 1
+ local tag = ElementNode:new(index, name, node, descend, openstart, tpos)
+ node = tag
- local tagst, apos = tag:gettext(), 1
- while true do
- local start, k, eq, quote, v
- start, apos, k, eq, quote = string.find(tagst,
- "%s+" .. -- some uncaptured space
- "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
- "(=?)" .. -- eq = the optional; "=", else ""
- "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
- apos)
- if not k or k == "/>" or k == ">" then break end
- if eq == "=" then
- local pattern = "=([^%s>]*)"
- if quote ~= "" then
- pattern = quote .. "([^" .. quote .. "]*)" .. quote
- end
- start, apos, v = string.find(tagst, pattern, apos)
- end
- tag:addattribute(k, v or "")
- end
+ local tagst, apos = tag:gettext(), 1
+ while true do
+ local start, k, eq, quote, v
+ start, apos, k, eq, quote = string.find(tagst,
+ "%s+" .. -- some uncaptured space
+ "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
+ "(=?)" .. -- eq = the optional; "=", else ""
+ "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
+ apos)
+ if not k or k == "/>" or k == ">" then break end
+ if eq == "=" then
+ local pattern = "=([^%s>]*)"
+ if quote ~= "" then
+ pattern = quote .. "([^" .. quote .. "]*)" .. quote
+ end
+ start, apos, v = string.find(tagst, pattern, apos)
+ end
+ tag:addattribute(k, v or "")
+ end
- if voidelements[string.lower(tag.name)] then
- descend = false
- tag:close()
- else
- opentags[tag.name] = opentags[tag.name] or {}
- table.insert(opentags[tag.name], tag)
- end
+ if voidelements[string.lower(tag.name)] then
+ descend = false
+ tag:close()
+ else
+ opentags[tag.name] = opentags[tag.name] or {}
+ table.insert(opentags[tag.name], tag)
+ end
- local closeend = tpos
- while true do
- local closestart, closing, closename
- closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
- if not closing or closing == "" then break end
- tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
- closestart = string.find(root._text, "<", closestart)
- tag:close(closestart, closeend + 1)
- node = tag.parent
- descend = true
- end
- end
+ local closeend = tpos
+ while true do
+ local closestart, closing, closename
+ closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)([%w-]+)", closeend)
+ if not closing or closing == "" then break end
+ tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
+ closestart = string.find(root._text, "<", closestart)
+ tag:close(closestart, closeend + 1)
+ node = tag.parent
+ descend = true
+ end
+ end
- return root
+ return root
end
HtmlParser.parse = parse
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index de21937..7c8d571 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -1,272 +1,273 @@
+-- vim: ft=lua ts=2
pcall(require, "luarocks.loader")
local Set = {}
Set.mt = {__index = Set}
function Set:new(values)
- local instance = {}
- local isSet if getmetatable(values) == Set.mt then isSet = true end
- if type(values) == "table" then
- if not isSet and #values > 0 then
- for _,v in ipairs(values) do
- instance[v] = true
- end
- else
- for k in pairs(values) do
- instance[k] = true
- end
- end
- elseif values ~= nil then
- instance = {[values] = true}
- end
- return setmetatable(instance, Set.mt)
+ local instance = {}
+ local isSet if getmetatable(values) == Set.mt then isSet = true end
+ if type(values) == "table" then
+ if not isSet and #values > 0 then
+ for _,v in ipairs(values) do
+ instance[v] = true
+ end
+ else
+ for k in pairs(values) do
+ instance[k] = true
+ end
+ end
+ elseif values ~= nil then
+ instance = {[values] = true}
+ end
+ return setmetatable(instance, Set.mt)
end
function Set:add(e)
- if e ~= nil then self[e] = true end
- return self
+ if e ~= nil then self[e] = true end
+ return self
end
function Set:remove(e)
- if e ~= nil then self[e] = nil end
- return self
+ if e ~= nil then self[e] = nil end
+ return self
end
function Set:tolist()
- local res = {}
- for k in pairs(self) do
- table.insert(res, k)
- end
- return res
+ local res = {}
+ for k in pairs(self) do
+ table.insert(res, k)
+ end
+ return res
end
Set.mt.__add = function (a, b)
- local res, a, b = Set:new(), Set:new(a), Set:new(b)
- for k in pairs(a) do res[k] = true end
- for k in pairs(b) do res[k] = true end
- return res
+ local res, a, b = Set:new(), Set:new(a), Set:new(b)
+ for k in pairs(a) do res[k] = true end
+ for k in pairs(b) do res[k] = true end
+ return res
end
-- Subtraction
Set.mt.__sub = function (a, b)
- local res, a, b = Set:new(), Set:new(a), Set:new(b)
- for k in pairs(a) do res[k] = true end
- for k in pairs(b) do res[k] = nil end
- return res
+ local res, a, b = Set:new(), Set:new(a), Set:new(b)
+ for k in pairs(a) do res[k] = true end
+ for k in pairs(b) do res[k] = nil end
+ return res
end
-- Intersection
Set.mt.__mul = function (a, b)
- local res, a, b = Set:new(), Set:new(a), Set:new(b)
- for k in pairs(a) do
- res[k] = b[k]
- end
- return res
+ local res, a, b = Set:new(), Set:new(a), Set:new(b)
+ for k in pairs(a) do
+ res[k] = b[k]
+ end
+ return res
end
-- String representation
Set.mt.__tostring = function (set)
- local s = "{"
- local sep = ""
- for k in pairs(set) do
- s = s .. sep .. tostring(k)
- sep = ", "
- end
- return s .. "}"
+ local s = "{"
+ local sep = ""
+ for k in pairs(set) do
+ s = s .. sep .. tostring(k)
+ sep = ", "
+ end
+ return s .. "}"
end
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}
function ElementNode:new(index, nameortext, node, descend, openstart, openend)
- local instance = {
- index = index,
- name = nameortext,
- level = 0,
- parent = nil,
- root = nil,
- nodes = {},
- _openstart = openstart, _openend = openend,
- _closestart = openstart, _closeend = openend,
- attributes = {},
- id = nil,
- classes = {},
- deepernodes = Set:new(),
- deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
- }
- if not node then
- instance.name = "root"
- instance.root = instance
- instance._text = nameortext
- local length = string.len(nameortext)
- instance._openstart, instance._openend = 1, length
- instance._closestart, instance._closeend = 1, length
- elseif descend then
- instance.root = node.root
- instance.parent = node
- instance.level = node.level + 1
- table.insert(node.nodes, instance)
- else
- instance.root = node.root
- instance.parent = node.parent
- instance.level = node.level
- table.insert(node.parent.nodes, instance)
- end
- return setmetatable(instance, ElementNode.mt)
+ local instance = {
+ index = index,
+ name = nameortext,
+ level = 0,
+ parent = nil,
+ root = nil,
+ nodes = {},
+ _openstart = openstart, _openend = openend,
+ _closestart = openstart, _closeend = openend,
+ attributes = {},
+ id = nil,
+ classes = {},
+ deepernodes = Set:new(),
+ deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
+ }
+ if not node then
+ instance.name = "root"
+ instance.root = instance
+ instance._text = nameortext
+ local length = string.len(nameortext)
+ instance._openstart, instance._openend = 1, length
+ instance._closestart, instance._closeend = 1, length
+ elseif descend then
+ instance.root = node.root
+ instance.parent = node
+ instance.level = node.level + 1
+ table.insert(node.nodes, instance)
+ else
+ instance.root = node.root
+ instance.parent = node.parent
+ instance.level = node.level
+ table.insert(node.parent.nodes, instance)
+ end
+ return setmetatable(instance, ElementNode.mt)
end
function ElementNode:gettext()
- return string.sub(self.root._text, self._openstart, self._closeend)
+ return string.sub(self.root._text, self._openstart, self._closeend)
end
function ElementNode:getcontent()
- return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
+ return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
end
function ElementNode:addattribute(k, v)
- self.attributes[k] = v
- if string.lower(k) == "id" then
- self.id = v
- -- class attribute contains "space-separated tokens", each of which we'd like quick access to
- elseif string.lower(k) == "class" then
- for class in string.gmatch(v, "%S+") do
- table.insert(self.classes, class)
- end
- end
+ self.attributes[k] = v
+ if string.lower(k) == "id" then
+ self.id = v
+ -- class attribute contains "space-separated tokens", each of which we'd like quick access to
+ elseif string.lower(k) == "class" then
+ for class in string.gmatch(v, "%S+") do
+ table.insert(self.classes, class)
+ end
+ end
end
local function insert(table, name, node)
- table[name] = table[name] or Set:new()
- table[name]:add(node)
+ table[name] = table[name] or Set:new()
+ table[name]:add(node)
end
function ElementNode:close(closestart, closeend)
- if closestart and closeend then
- self._closestart, self._closeend = closestart, closeend
- end
- -- inform hihger level nodes about this element's existence in their branches
- local node = self
- while true do
- node = node.parent
- if not node then break end
- node.deepernodes:add(self)
- insert(node.deeperelements, self.name, self)
- for k in pairs(self.attributes) do
- insert(node.deeperattributes, k, self)
- end
- if self.id then
- insert(node.deeperids, self.id, self)
- end
- for _,v in ipairs(self.classes) do
- insert(node.deeperclasses, v, self)
- end
- end
+ if closestart and closeend then
+ self._closestart, self._closeend = closestart, closeend
+ end
+ -- inform hihger level nodes about this element's existence in their branches
+ local node = self
+ while true do
+ node = node.parent
+ if not node then break end
+ node.deepernodes:add(self)
+ insert(node.deeperelements, self.name, self)
+ for k in pairs(self.attributes) do
+ insert(node.deeperattributes, k, self)
+ end
+ if self.id then
+ insert(node.deeperids, self.id, self)
+ end
+ for _,v in ipairs(self.classes) do
+ insert(node.deeperclasses, v, self)
+ end
+ end
end
local function escape(s)
- -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix
- return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")
+ -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix
+ return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")
end
local function select(self, s)
- if not s or type(s) ~= "string" or s == "" then return Set:new() end
- local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
- ["#"] = self.deeperids, ["."] = self.deeperclasses}
- local function match(t, w)
- local m, e, v
- if t == "[" then w, m, e, v = string.match(w,
- "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
- "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
- "(=?)" .. -- e = the optional "="
- "(.*)" -- v = anything following the "=", or else ""
- )
- end
- local matched = Set:new(sets[t][w])
- -- attribute value selectors
- if e == "=" then
- if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
- v = string.sub(v, 2, #v - 1) -- strip quotes
- if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
- for node in pairs(matched) do
- local a = node.attributes[w]
- -- equals
- if m == "" and a ~= v then matched:remove(node)
- -- not equals
- elseif m == "!" and a == v then matched:remove(node)
- -- prefix
- elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
- -- contains
- elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node)
- -- word
- elseif m =="~" then matched:remove(node)
- for word in string.gmatch(a, "%S+") do
- if word == v then matched:add(node) break end
- end
- -- starts with
- elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
- -- ends with
- elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
- end
- end -- for node
- end -- if v
- return matched
- end
+ if not s or type(s) ~= "string" or s == "" then return Set:new() end
+ local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
+ ["#"] = self.deeperids, ["."] = self.deeperclasses}
+ local function match(t, w)
+ local m, e, v
+ if t == "[" then w, m, e, v = string.match(w,
+ "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
+ "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
+ "(=?)" .. -- e = the optional "="
+ "(.*)" -- v = anything following the "=", or else ""
+ )
+ end
+ local matched = Set:new(sets[t][w])
+ -- attribute value selectors
+ if e == "=" then
+ if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
+ v = string.sub(v, 2, #v - 1) -- strip quotes
+ if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
+ for node in pairs(matched) do
+ local a = node.attributes[w]
+ -- equals
+ if m == "" and a ~= v then matched:remove(node)
+ -- not equals
+ elseif m == "!" and a == v then matched:remove(node)
+ -- prefix
+ elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
+ -- contains
+ elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node)
+ -- word
+ elseif m =="~" then matched:remove(node)
+ for word in string.gmatch(a, "%S+") do
+ if word == v then matched:add(node) break end
+ end
+ -- starts with
+ elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
+ -- ends with
+ elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
+ end
+ end -- for node
+ end -- if v
+ return matched
+ end
- local subjects, resultset, childrenonly = Set:new({self})
- for part in string.gmatch(s, "%S+") do
+ local subjects, resultset, childrenonly = Set:new({self})
+ for part in string.gmatch(s, "%S+") do
repeat
- if part == ">" then childrenonly = true --[[goto nextpart]] break end
- resultset = Set:new()
- for subject in pairs(subjects) do
- local star = subject.deepernodes
- if childrenonly then star = Set:new(subject.nodes) end
- resultset = resultset + star
- end
- childrenonly = false
- if part == "*" then --[[goto nextpart]] break end
- local excludes, filter = Set:new()
- local start, pos = 0, 0
- while true do
- local switch, stype, name, eq, quote
- start, pos, switch, stype, name, eq, quote = string.find(part,
- "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
- "([:%[#.]?)" .. -- stype = a possible :, [, #, or .
- "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore)
- "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
- "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
- pos + 1
- )
- if not name then break end
+ if part == ">" then childrenonly = true --[[goto nextpart]] break end
+ resultset = Set:new()
+ for subject in pairs(subjects) do
+ local star = subject.deepernodes
+ if childrenonly then star = Set:new(subject.nodes) end
+ resultset = resultset + star
+ end
+ childrenonly = false
+ if part == "*" then --[[goto nextpart]] break end
+ local excludes, filter = Set:new()
+ local start, pos = 0, 0
+ while true do
+ local switch, stype, name, eq, quote
+ start, pos, switch, stype, name, eq, quote = string.find(part,
+ "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
+ "([:%[#.]?)" .. -- stype = a possible :, [, #, or .
+ "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore)
+ "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
+ "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
+ pos + 1
+ )
+ if not name then break end
repeat
- if ":" == stype then
- filter = name
- --[[goto nextname]] break
- end
- if ")" == switch then
- filter = nil
- end
- if "[" == stype and "" ~= quote then
- local value
- start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
- name = name .. eq .. value
- end
- local matched = match(stype, name)
- if filter == "not" then
- excludes = excludes + matched
- else
- resultset = resultset * matched
- end
- --::nextname::
+ if ":" == stype then
+ filter = name
+ --[[goto nextname]] break
+ end
+ if ")" == switch then
+ filter = nil
+ end
+ if "[" == stype and "" ~= quote then
+ local value
+ start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
+ name = name .. eq .. value
+ end
+ local matched = match(stype, name)
+ if filter == "not" then
+ excludes = excludes + matched
+ else
+ resultset = resultset * matched
+ end
+ --::nextname::
break
until true
- end
- resultset = resultset - excludes
- subjects = Set:new(resultset)
- --::nextpart::
+ end
+ resultset = resultset - excludes
+ subjects = Set:new(resultset)
+ --::nextpart::
break
until true
- end
- resultset = resultset:tolist()
- table.sort(resultset, function (a, b) return a.index < b.index end)
- return resultset
+ end
+ resultset = resultset:tolist()
+ table.sort(resultset, function (a, b) return a.index < b.index end)
+ return resultset
end
function ElementNode:select(s) return select(self, s) end
diff --git a/src/htmlparser/voidelements.lua b/src/htmlparser/voidelements.lua
index b95b010..43dedf5 100644
--- a/src/htmlparser/voidelements.lua
+++ b/src/htmlparser/voidelements.lua
@@ -1,18 +1,19 @@
+-- vim: ft=lua ts=2
return {
- area = true,
- base = true,
- br = true,
- col = true,
- command = true,
- embed = true,
- hr = true,
- img = true,
- input = true,
- keygen = true,
- link = true,
- meta = true,
- param = true,
- source = true,
- track = true,
- wbr = true
+ area = true,
+ base = true,
+ br = true,
+ col = true,
+ command = true,
+ embed = true,
+ hr = true,
+ img = true,
+ input = true,
+ keygen = true,
+ link = true,
+ meta = true,
+ param = true,
+ source = true,
+ track = true,
+ wbr = true
}