mirror of
https://github.com/TangentFoxy/lua-htmlparser.git
synced 2026-01-10 16:08:21 +00:00
First draft for a Rock setup
This commit is contained in:
69
src/htmlparser.lua
Normal file
69
src/htmlparser.lua
Normal file
@@ -0,0 +1,69 @@
|
||||
local ElementNode = require("htmlparser.ElementNode")
|
||||
local voidelements = require("htmlparser.voidelements")
|
||||
|
||||
-- Module table; it is returned at the end of this file and carries the public `parse` entry point.
local HtmlParser = {}
|
||||
|
||||
--- Parse an HTML string into a tree of ElementNode objects.
-- The text is scanned left to right for opening tags. Each opening tag becomes
-- a node, its attributes are read from the tag text, and any closing tags that
-- follow it are used to close the matching open nodes.
-- @param text the HTML source string
-- @return the root ElementNode; parsed elements hang below it via `nodes`
local function parse(text)
  local root = ElementNode:new(text)

  -- node:     the most recently created or closed-into node
  -- descend:  true when the next tag is a child of `node`, false when a sibling
  -- tpos:     scan position in the source text
  -- opentags: lowercase tag name -> stack of nodes still waiting for a closing tag
  local node, descend, tpos, opentags = root, true, 1, {}
  while true do
    local openstart, name
    openstart, tpos, name = string.find(root._text,
      "<" ..      -- an uncaptured starting "<"
      "(%w+)" ..  -- name = the first word, directly following the "<"
      "[^>]*>",   -- include, but not capture everything up to the next ">"
      tpos)
    if not name then break end
    local tag = ElementNode:new(name, node, descend, openstart, tpos)
    node = tag

    -- The node is not closed yet, so gettext() returns just the opening tag.
    local tagst, apos = tag:gettext(), 1
    while true do
      local _, k, eq, quote, v
      _, apos, k, eq, quote = string.find(tagst,
        "%s+" ..          -- some uncaptured space
        "([^%s=/>]+)" ..  -- k = an unspaced string up to an optional "=", never including "/" or ">"
        "(=?)" ..         -- eq = the optional "=", else ""
        "(['\"]?)",       -- quote = an optional "'" or '"' following the "=", or ""
        apos)
      if not k then break end
      if eq == "" then
        -- attribute without a value, e.g. <input disabled>
        v = ""
      else
        local pattern = "=([^%s>]*)"
        if quote ~= "" then
          pattern = quote .. "([^" .. quote .. "]*)" .. quote
        end
        local vend
        _, vend, v = string.find(tagst, pattern, apos)
        if vend then
          apos = vend
        else
          -- Unterminated quote: use the rest of the tag (minus the final ">")
          -- as the value and stop scanning, instead of restarting from the
          -- beginning of the tag forever.
          v = string.sub(tagst, apos + 1, -2)
          apos = #tagst
        end
      end
      tag:addattribute(k, v)
    end

    local lname = string.lower(tag.name)
    if voidelements[lname] then
      -- void elements have no content: the next tag is a sibling
      descend = false
      tag:close()
    else
      -- a regular element: the next opening tag (if any) is its child
      descend = true
      opentags[lname] = opentags[lname] or {}
      table.insert(opentags[lname], tag)
    end

    -- Consume every closing tag that directly follows (only text in between).
    local closeend = tpos
    while true do
      local closestart, closing, closename
      closestart, closeend, closing, closename =
        string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
      if not closing or closing == "" then break end
      local stack = opentags[string.lower(closename)]
      local closed = stack and table.remove(stack)
      -- A closing tag without a matching open tag is ignored.
      if closed then
        -- the match may start in the text before the tag; locate the actual "<"
        closestart = string.find(root._text, "<", closestart)
        closed:close(closestart, closeend + 1)
        node = closed.parent
        descend = true
      end
    end
  end

  return root
end
|
||||
-- Expose the local parse function as HtmlParser.parse.
HtmlParser.parse = parse
|
||||
|
||||
return HtmlParser
|
||||
|
||||
174
src/htmlparser/ElementNode.lua
Normal file
174
src/htmlparser/ElementNode.lua
Normal file
@@ -0,0 +1,174 @@
|
||||
require("luarocks.loader")
|
||||
local Set = require("Set")
|
||||
|
||||
-- Prototype table for all nodes. Instances receive `ElementNode.mt` as their
-- metatable, so missing fields and methods are looked up on ElementNode.
local ElementNode = {}
ElementNode.mt = {}
ElementNode.mt.__index = ElementNode
|
||||
--- Create a new node.
-- Called with only `nameortext`, it builds the root node: the argument is the
-- full source text, stored as `_text`, and the node is named "root".
-- Called with a `node`, it builds an element named `nameortext` and links it
-- into the tree relative to that node.
-- @param nameortext element name, or the whole source text for the root
-- @param node       existing node to attach relative to (nil for the root)
-- @param descend    truthy: attach as a child of `node`; falsy: as its sibling
-- @param openstart  position of the opening tag's first character
-- @param openend    position of the opening tag's last character
-- @return the new node, with ElementNode.mt as its metatable
function ElementNode:new(nameortext, node, descend, openstart, openend)
  local instance = {
    name = nameortext,
    level = 0,
    nodes = {},
    attributes = {},
    classes = {},
    -- until close() is told otherwise, the close positions mirror the open ones
    _openstart = openstart, _openend = openend,
    _closestart = openstart, _closeend = openend,
    -- lookup tables filled in by close() of nodes further down
    deepernodes = Set:new(),
    deeperelements = {},
    deeperattributes = {},
    deeperids = {},
    deeperclasses = {},
  }

  if node then
    local parent, level
    if descend then
      parent, level = node, node.level + 1
    else
      parent, level = node.parent, node.level
    end
    instance.root = node.root
    instance.parent = parent
    instance.level = level
    table.insert(parent.nodes, instance)
  else
    -- root node: spans the whole text
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._openend = 1, length
    instance._closestart, instance._closeend = 1, length
  end

  return setmetatable(instance, ElementNode.mt)
end
|
||||
|
||||
--- Return the slice of the root's source text from this node's `_openstart`
-- through its `_closeend`.
function ElementNode:gettext()
  local source = self.root._text
  return source:sub(self._openstart, self._closeend)
end
|
||||
|
||||
--- Return the slice of the root's source text that lies strictly between the
-- end of the opening tag (`_openend`) and the start of the closing tag
-- (`_closestart`).
function ElementNode:getcontent()
  local source = self.root._text
  local first, last = self._openend + 1, self._closestart - 1
  return source:sub(first, last)
end
|
||||
|
||||
--- Record an attribute on this node.
-- The value is stored in `attributes[k]`. An "id" attribute is also stored in
-- `self.id`, and a "class" attribute is split into its tokens, which are
-- appended to `self.classes`. The key comparison is case-insensitive.
-- @param k attribute name
-- @param v attribute value; nil is treated as the empty string
function ElementNode:addattribute(k, v)
  -- A missing value must not crash the class tokenizer below.
  v = v or ""
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
  elseif lowered == "class" then
    -- class attribute contains "space-separated tokens", each of which we'd like quick access to
    for class in string.gmatch(v, "%S+") do
      table.insert(self.classes, class)
    end
  end
end
|
||||
|
||||
-- Add `node` to the Set stored under `index[name]`, creating that Set first
-- when the slot is still empty.
local function insert(index, name, node)
  local bucket = index[name]
  if not bucket then
    bucket = Set:new()
    index[name] = bucket
  end
  bucket:add(node)
end
|
||||
|
||||
--- Mark this node as closed and register it with all of its ancestors.
-- @param closestart position where the closing tag starts (optional)
-- @param closeend   position where the closing tag ends (optional)
-- The close positions are only updated when both arguments are given.
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart = closestart
    self._closeend = closeend
  end

  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attrname in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attrname, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for _, classname in ipairs(self.classes) do
      insert(ancestor.deeperclasses, classname, self)
    end
    ancestor = ancestor.parent
  end
end
|
||||
|
||||
--- Escape a string so it can be used as a literal inside a Lua pattern.
-- @param s the string to escape
-- @return exactly one value: `s` with every ^, $, (, ), %, ., [, ], *, +, -
--         and ? prefixed by a %
local function escape(s)
  -- string.gsub also returns the number of substitutions. Keep only the
  -- string, so that a call like string.match(a, escape(v)) does not receive
  -- that count as its `init` argument.
  local escaped = string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")
  return escaped
end
|
||||
|
||||
--- Find nodes below `self` that match a selector string.
-- The selector is split on whitespace into parts, applied left to right:
--   * a part of ">" restricts the next part to direct children of the
--     current subjects
--   * a part of "*" takes every candidate node
--   * any other part is a run of simple selectors: an element name, "#id",
--     ".class", "[attr]", "[attr<op>='value']" and ":not(...)"
-- @param self the node to search below
-- @param s    the selector string
-- @return a Set of matching nodes; an empty Set when `s` is not a non-empty
--         string
local function select(self, s)
  if not s or type(s) ~= "string" or s == "" then return Set:new() end

  -- selector type prefix -> lookup table maintained by ElementNode:close()
  local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
                ["#"] = self.deeperids, ["."] = self.deeperclasses}

  -- Return the Set of nodes below `self` matching one simple selector of
  -- type `t` ("", "[", "#" or ".") with body `w`.
  local function match(t, w)
    local m, v
    if t == "[" then w, m, v = string.match(w,
        "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
        "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
        "=?" .. -- an optional uncaptured "="
        "(.*)" -- v = anything following the "=", or else ""
      )
    end
    local matched = Set:new(sets[t][w])
    -- attribute value selectors
    if v and v ~= "" then
      -- strip the surrounding quotes, but only when they are actually there
      local _, unquoted = string.match(v, "^(['\"])(.*)%1$")
      if unquoted then v = unquoted end
      for node in pairs(matched) do
        local a = node.attributes[w]
        local keep
        if m == "" then -- equals
          keep = (a == v)
        elseif m == "!" then -- not equals
          keep = (a ~= v)
        elseif m == "|" then -- prefix
          keep = (string.match(a, "^[^-]*") == v)
        elseif m == "*" then -- contains (plain search, no pattern needed)
          keep = (string.find(a, v, 1, true) ~= nil)
        elseif m == "~" then -- word
          keep = false
          for word in string.gmatch(a, "%S+") do
            if word == v then keep = true break end
          end
        elseif m == "^" then -- starts with
          keep = (string.match(a, "^" .. escape(v)) == v)
        elseif m == "$" then -- ends with
          keep = (string.match(a, escape(v) .. "$") == v)
        end
        -- Only ever remove during the traversal; never re-add a removed key.
        if not keep then matched:remove(node) end
      end -- for node
    end -- if v
    return matched
  end

  local subjects, resultset, childrenonly = Set:new({self})
  for part in string.gmatch(s, "%S+") do
    if part == ">" then
      childrenonly = true
    else
      -- candidates: all descendants (or only children) of every subject
      resultset = Set:new()
      for subject in pairs(subjects) do
        local star = subject.deepernodes
        if childrenonly then star = Set:new(subject.nodes) end
        resultset = resultset + star
      end
      -- reset only after ALL subjects were handled
      childrenonly = false
      if part ~= "*" then
        local excludes, filter = Set:new()
        for t, w in string.gmatch(part,
          "([:%[#.]?)" .. -- t = an optional :, [, #, or .
          "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
          "%]?%)?" -- followed by an uncaptured optional ] and/or )
        ) do
          if t == ":" then
            -- remember the pseudo-class; it applies to the next simple selector
            filter = w
          else
            local matched = match(t, w)
            if filter == "not" then
              excludes = excludes + matched
            else
              resultset = resultset * matched
            end
            filter = nil
          end
        end
        resultset = resultset - excludes
        subjects = Set:new(resultset)
      end
    end
  end
  return resultset
end
|
||||
|
||||
--- Method form of the selector query; forwards to the local `select`.
-- @param s selector string
function ElementNode:select(s)
  return select(self, s)
end

-- Calling a node like a function runs the same query through `__call`.
ElementNode.mt.__call = select
|
||||
|
||||
return ElementNode
|
||||
18
src/htmlparser/voidelements.lua
Normal file
18
src/htmlparser/voidelements.lua
Normal file
@@ -0,0 +1,18 @@
|
||||
-- Lookup table of void element names: each listed tag name maps to true.
-- Names are lowercase; the parser lowercases a tag name before looking it up.
local names = {
  "area", "base", "br", "col", "command", "embed", "hr", "img",
  "input", "keygen", "link", "meta", "param", "source", "track", "wbr",
}

local voidelements = {}
for _, name in ipairs(names) do
  voidelements[name] = true
end

return voidelements
|
||||
Reference in New Issue
Block a user