mirror of
https://github.com/TangentFoxy/lua-htmlparser.git
synced 2025-07-28 11:02:18 +00:00
Merge pull request #12 from wscherphof/#6-selectors
#6 selectors: :not() and [attribute="value"] working
This commit is contained in:
@@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend)
|
|||||||
deepernodes = Set:new(),
|
deepernodes = Set:new(),
|
||||||
deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
|
deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
|
||||||
}
|
}
|
||||||
if nameortext == "container" then
|
if not node then
|
||||||
instance.root = node
|
|
||||||
elseif not node then
|
|
||||||
instance.name = "root"
|
instance.name = "root"
|
||||||
instance.root = instance
|
instance.root = instance
|
||||||
instance._text = nameortext
|
instance._text = nameortext
|
||||||
@@ -90,48 +88,67 @@ function ElementNode:close(closestart, closeend)
|
|||||||
end
|
end
|
||||||
|
|
||||||
local function select(self, s)
|
local function select(self, s)
|
||||||
if not s or type(s) ~= "string" then return {} end
|
if not s or type(s) ~= "string" or s == "" then return Set:new() end
|
||||||
local subjects = Set:new({self})
|
|
||||||
local resultset
|
local function match(t, w)
|
||||||
local childrenonly
|
local sets = {
|
||||||
|
[""] = self.deeperelements,
|
||||||
|
["["] = self.deeperattributes,
|
||||||
|
["#"] = self.deeperids,
|
||||||
|
["."] = self.deeperclasses
|
||||||
|
}
|
||||||
|
local v
|
||||||
|
if t == "[" then
|
||||||
|
w, v = string.match(w,
|
||||||
|
"([^=]+)" .. -- w = 1 or more characters up to a possible "="
|
||||||
|
"=?" .. -- an optional uncaptured "="
|
||||||
|
"(.*)" -- v = anything following the "=", or else ""
|
||||||
|
)
|
||||||
|
end
|
||||||
|
local matched = sets[t][w]
|
||||||
|
if v and v ~= "" then
|
||||||
|
v = string.sub(v, 2, #v - 1) -- strip quotes
|
||||||
|
for node in pairs(matched) do
|
||||||
|
if node.attributes[w] ~= v then
|
||||||
|
matched:remove(node)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return matched
|
||||||
|
end
|
||||||
|
|
||||||
|
local subjects, resultset, childrenonly = Set:new({self})
|
||||||
for part in string.gmatch(s, "%S+") do
|
for part in string.gmatch(s, "%S+") do
|
||||||
if part == ">" then childrenonly = true goto nextpart end
|
if part == ">" then childrenonly = true goto nextpart end
|
||||||
resultset = Set:new()
|
resultset = Set:new()
|
||||||
for subject in pairs(subjects) do
|
for subject in pairs(subjects) do
|
||||||
local init = subject.deepernodes
|
local star = subject.deepernodes
|
||||||
if childrenonly then init = Set:new(subject.nodes) childrenonly = false end
|
if childrenonly then star = Set:new(subject.nodes) end
|
||||||
resultset = resultset + init
|
childrenonly = false
|
||||||
|
resultset = resultset + star
|
||||||
end
|
end
|
||||||
if part == "*" then goto nextpart end
|
if part == "*" then goto nextpart end
|
||||||
for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
|
local excludes, filter = Set:new()
|
||||||
if t == "" then resultset = resultset * self.deeperelements[w]
|
for t, w in string.gmatch(part,
|
||||||
elseif t == "[" then resultset = resultset * self.deeperattributes[w]
|
"([:%[#.]?)" .. -- t = an optional :, [, #, or .
|
||||||
elseif t == "#" then resultset = resultset * self.deeperids[w]
|
"([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
|
||||||
elseif t == "." then resultset = resultset * self.deeperclasses[w]
|
"%]?%)?" -- followed by an uncaptured optional ] and/or )
|
||||||
|
) do
|
||||||
|
if t == ":" then filter = w goto nextw end
|
||||||
|
local matched = match(t, w)
|
||||||
|
if filter == "not" then
|
||||||
|
excludes = excludes + matched
|
||||||
|
else
|
||||||
|
resultset = resultset * matched
|
||||||
end
|
end
|
||||||
|
filter = nil
|
||||||
|
::nextw::
|
||||||
end
|
end
|
||||||
|
resultset = resultset - excludes
|
||||||
subjects = Set:new(resultset)
|
subjects = Set:new(resultset)
|
||||||
::nextpart::
|
::nextpart::
|
||||||
end
|
end
|
||||||
-- construct a container node for the resultset, so that we can :select() on it
|
return resultset
|
||||||
local ret = ElementNode:new("container", self)
|
|
||||||
for node in pairs(resultset) do
|
|
||||||
table.insert(ret.nodes, node)
|
|
||||||
ret.deepernodes = ret.deepernodes + node.deepernodes
|
|
||||||
for listname,list in pairs({
|
|
||||||
deeperelements = node.deeperelements,
|
|
||||||
deeperattributes = node.deeperattributes,
|
|
||||||
deeperids = node.deeperids,
|
|
||||||
deeperclasses = node.deeperclasses
|
|
||||||
}) do
|
|
||||||
local target = ret[listname]
|
|
||||||
for k,set in pairs(list) do
|
|
||||||
-- Set.__add will create an empty Set if not target[k]
|
|
||||||
target[k] = target[k] + set
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return ret
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function ElementNode:select(s) return select(self, s) end
|
function ElementNode:select(s) return select(self, s) end
|
||||||
|
@@ -9,7 +9,11 @@ local function parse(text)
|
|||||||
local node, descend, tpos, opentags = root, true, 1, {}
|
local node, descend, tpos, opentags = root, true, 1, {}
|
||||||
while true do
|
while true do
|
||||||
local openstart, name
|
local openstart, name
|
||||||
openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
|
openstart, tpos, name = string.find(root._text,
|
||||||
|
"<" .. -- an uncaptured starting "<"
|
||||||
|
"(%w+)" .. -- name = the first word, directly following the "<"
|
||||||
|
"[^>]*>", -- include, but not capture everything up to the next ">"
|
||||||
|
tpos)
|
||||||
if not name then break end
|
if not name then break end
|
||||||
local tag = ElementNode:new(name, node, descend, openstart, tpos)
|
local tag = ElementNode:new(name, node, descend, openstart, tpos)
|
||||||
node = tag
|
node = tag
|
||||||
@@ -17,12 +21,17 @@ local function parse(text)
|
|||||||
local tagst, apos = tag:gettext(), 1
|
local tagst, apos = tag:gettext(), 1
|
||||||
while true do
|
while true do
|
||||||
local start, k, eq, quote, v
|
local start, k, eq, quote, v
|
||||||
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
|
start, apos, k, eq, quote = string.find(tagst,
|
||||||
if not k then break end
|
"%s+" .. -- some uncaptured space
|
||||||
|
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
|
||||||
|
"(=?)" .. -- eq = the optional "=", else ""
|
||||||
|
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
||||||
|
apos)
|
||||||
|
if not k or k == "/>" then break end
|
||||||
if eq == "" then
|
if eq == "" then
|
||||||
v = ""
|
v = ""
|
||||||
else
|
else
|
||||||
local pattern = "=([^%s'\">]*)"
|
local pattern = "=([^%s>]*)"
|
||||||
if quote ~= '' then
|
if quote ~= '' then
|
||||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||||
end
|
end
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5>
|
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5 test6=val""6>
|
||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8" />
|
<meta charset="utf-8" />
|
||||||
<link rel="stylesheet" href="test.css" />
|
<link rel="stylesheet" href="test.css" />
|
||||||
|
39
test.lua
39
test.lua
@@ -10,11 +10,8 @@ local root = htmlparser.parse(text)
|
|||||||
local function p(n)
|
local function p(n)
|
||||||
local space = string.rep(" ", n.level)
|
local space = string.rep(" ", n.level)
|
||||||
local s = space .. n.name
|
local s = space .. n.name
|
||||||
for i,v in ipairs(n.nodes) do
|
|
||||||
s = s .. " nodes[" .. i .. "]=" .. v.name
|
|
||||||
end
|
|
||||||
for k,v in pairs(n.attributes) do
|
for k,v in pairs(n.attributes) do
|
||||||
s = s .. " " .. k .. "=[" .. v .. "]"
|
s = s .. " " .. k .. "=[[" .. v .. "]]"
|
||||||
end
|
end
|
||||||
print(s)
|
print(s)
|
||||||
for i,v in ipairs(n.nodes) do
|
for i,v in ipairs(n.nodes) do
|
||||||
@@ -26,11 +23,11 @@ p(root)
|
|||||||
local function select( s )
|
local function select( s )
|
||||||
print ""
|
print ""
|
||||||
print("->", s)
|
print("->", s)
|
||||||
local tags = root:select(s)
|
local sel = root:select(s)
|
||||||
for i,t in ipairs(tags.nodes) do
|
for element in pairs(sel) do
|
||||||
print(t.name)
|
print(element.name)
|
||||||
end
|
end
|
||||||
print(# tags.nodes)
|
print(sel:len())
|
||||||
end
|
end
|
||||||
select("*")
|
select("*")
|
||||||
select("link")
|
select("link")
|
||||||
@@ -51,10 +48,22 @@ select("ul > *")
|
|||||||
select("body [class]")
|
select("body [class]")
|
||||||
select("body > [class]")
|
select("body > [class]")
|
||||||
|
|
||||||
|
select(".contacts span:not(.firstname)")
|
||||||
|
select(":not(a)[href]")
|
||||||
|
select("[itemscope]:not([itemprop])")
|
||||||
|
|
||||||
|
select("link[rel='alternate']")
|
||||||
|
select("[test2=\"val='2'\"]")
|
||||||
|
select("[test5='val5']")
|
||||||
|
select("[test6='val\"\"6']")
|
||||||
|
select("[itemscope='']")
|
||||||
|
select("[itemscope=]")
|
||||||
|
select("[itemscope]")
|
||||||
|
|
||||||
print("\nchapters")
|
print("\nchapters")
|
||||||
local sel, chapters = root("ol.chapters > li"), {}
|
local sel, chapters = root("ol.chapters > li"), {}
|
||||||
for _,v in ipairs(sel.nodes) do
|
for e in pairs(sel) do
|
||||||
table.insert(chapters, v:getcontent())
|
table.insert(chapters, e:getcontent())
|
||||||
end
|
end
|
||||||
-- print
|
-- print
|
||||||
for i,v in ipairs(chapters) do
|
for i,v in ipairs(chapters) do
|
||||||
@@ -62,11 +71,11 @@ for i,v in ipairs(chapters) do
|
|||||||
end
|
end
|
||||||
|
|
||||||
print("\ncontacts")
|
print("\ncontacts")
|
||||||
local sel, contacts = root("ul.contacts > li")("span[class]"), {}
|
local sel, contacts = root("ul.contacts span[class]"), {}
|
||||||
for _,v in ipairs(sel.nodes) do
|
for e in pairs(sel) do
|
||||||
local id = v.parent.parent.id -- li > a > span
|
local id = e.parent.parent.id -- li > a > span
|
||||||
contacts[id] = contacts[id] or {}
|
contacts[id] = contacts[id] or {}
|
||||||
contacts[id][v.classes[1]] = v:getcontent()
|
contacts[id][e.classes[1]] = e:getcontent()
|
||||||
end
|
end
|
||||||
-- print
|
-- print
|
||||||
for k,v in pairs(contacts) do
|
for k,v in pairs(contacts) do
|
||||||
@@ -78,7 +87,7 @@ end
|
|||||||
|
|
||||||
print("\nmicrodata")
|
print("\nmicrodata")
|
||||||
local sel, scopes = root("[itemprop]"), {}
|
local sel, scopes = root("[itemprop]"), {}
|
||||||
for _,prop in ipairs(sel.nodes) do
|
for prop in pairs(sel) do
|
||||||
if prop.attributes["itemscope"] then goto nextprop end
|
if prop.attributes["itemscope"] then goto nextprop end
|
||||||
local descendantscopes, scope = {}, prop
|
local descendantscopes, scope = {}, prop
|
||||||
while true do
|
while true do
|
||||||
|
Reference in New Issue
Block a user