Merge pull request #12 from wscherphof/#6-selectors

#6 selectors: :not() and [attribute="value"] working
This commit is contained in:
Wouter Scherphof 2013-03-25 06:01:16 -07:00
commit 12815e032b
4 changed files with 90 additions and 55 deletions

View File

@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend)
deepernodes = Set:new(),
deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
}
if nameortext == "container" then
instance.root = node
elseif not node then
if not node then
instance.name = "root"
instance.root = instance
instance._text = nameortext
@ -90,48 +88,67 @@ function ElementNode:close(closestart, closeend)
end
local function select(self, s)
if not s or type(s) ~= "string" then return {} end
local subjects = Set:new({self})
local resultset
local childrenonly
if not s or type(s) ~= "string" or s == "" then return Set:new() end
--- Look up the nodes that `self` has indexed under one simple selector token.
-- The index Sets stored on `self` are never modified here: filtering on an
-- attribute value builds a fresh Set, so repeated selects see the same index.
-- @param t selector type prefix: "" (element name), "[" (attribute),
--          "#" (id) or "." (class)
-- @param w the selector word; for "[" it may carry a value, as in `name=value`,
--          `name='value'` or `name="value"`
-- @return the Set indexed under `w` (nil when nothing is indexed under it), or,
--         when a non-empty attribute value was given, a new Set holding only
--         the nodes whose attribute equals that value
local function match(t, w)
  local sets = {
    [""] = self.deeperelements,
    ["["] = self.deeperattributes,
    ["#"] = self.deeperids,
    ["."] = self.deeperclasses
  }
  local v
  if t == "[" then
    w, v = string.match(w,
      "([^=]+)" .. -- w = 1 or more characters up to a possible "="
      "=?" .. -- an optional uncaptured "="
      "(.*)" -- v = anything following the "=", or else ""
    )
  end
  local matched = sets[t][w]
  -- no value to compare against: presence in the index is all that is asked
  if not v or v == "" then
    return matched
  end
  -- strip the surrounding quotes, but only when the value really is quoted;
  -- an unquoted value such as val5 must be compared as written
  local quote = string.sub(v, 1, 1)
  if #v >= 2 and (quote == "'" or quote == '"') and string.sub(v, -1) == quote then
    v = string.sub(v, 2, -2)
  end
  -- collect the matching nodes in a list instead of removing the others from
  -- `matched`, which is the index Set itself
  local kept = {}
  if matched then -- nil when no node carries the attribute at all
    for node in pairs(matched) do
      if node.attributes[w] == v then
        kept[#kept + 1] = node
      end
    end
  end
  return Set:new(kept)
end
local subjects, resultset, childrenonly = Set:new({self})
for part in string.gmatch(s, "%S+") do
if part == ">" then childrenonly = true goto nextpart end
resultset = Set:new()
for subject in pairs(subjects) do
local init = subject.deepernodes
if childrenonly then init = Set:new(subject.nodes) childrenonly = false end
resultset = resultset + init
local star = subject.deepernodes
if childrenonly then star = Set:new(subject.nodes) end
childrenonly = false
resultset = resultset + star
end
if part == "*" then goto nextpart end
for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
if t == "" then resultset = resultset * self.deeperelements[w]
elseif t == "[" then resultset = resultset * self.deeperattributes[w]
elseif t == "#" then resultset = resultset * self.deeperids[w]
elseif t == "." then resultset = resultset * self.deeperclasses[w]
local excludes, filter = Set:new()
for t, w in string.gmatch(part,
"([:%[#.]?)" .. -- t = an optional :, [, #, or .
"([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
"%]?%)?" -- followed by an uncaptured optional ] and/or )
) do
if t == ":" then filter = w goto nextw end
local matched = match(t, w)
if filter == "not" then
excludes = excludes + matched
else
resultset = resultset * matched
end
filter = nil
::nextw::
end
resultset = resultset - excludes
subjects = Set:new(resultset)
::nextpart::
end
-- construct a container node for the resultset, so that we can :select() on it
local ret = ElementNode:new("container", self)
for node in pairs(resultset) do
table.insert(ret.nodes, node)
ret.deepernodes = ret.deepernodes + node.deepernodes
for listname,list in pairs({
deeperelements = node.deeperelements,
deeperattributes = node.deeperattributes,
deeperids = node.deeperids,
deeperclasses = node.deeperclasses
}) do
local target = ret[listname]
for k,set in pairs(list) do
-- Set.__add will create an empty Set if not target[k]
target[k] = target[k] + set
end
end
end
return ret
return resultset
end
--- Method form of the file-local `select` function.
-- Passes the receiver and the selector string `s` straight through and
-- returns whatever the local `select` returns.
function ElementNode:select(s)
  return select(self, s)
end

View File

@ -9,7 +9,11 @@ local function parse(text)
local node, descend, tpos, opentags = root, true, 1, {}
while true do
local openstart, name
openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
openstart, tpos, name = string.find(root._text,
"<" .. -- an uncaptured starting "<"
"(%w+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
if not name then break end
local tag = ElementNode:new(name, node, descend, openstart, tpos)
node = tag
@ -17,12 +21,17 @@ local function parse(text)
local tagst, apos = tag:gettext(), 1
while true do
local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
if not k then break end
start, apos, k, eq, quote = string.find(tagst,
"%s+" .. -- some uncaptured space
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
"(=?)" .. -- eq = the optional "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" then break end
if eq == "" then
v = ""
else
local pattern = "=([^%s'\">]*)"
local pattern = "=([^%s>]*)"
if quote ~= '' then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end

View File

@ -1,5 +1,5 @@
<!DOCTYPE html>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5 test6=val""6>
<head>
<meta charset="utf-8" />
<link rel="stylesheet" href="test.css" />

View File

@ -10,11 +10,8 @@ local root = htmlparser.parse(text)
local function p(n)
local space = string.rep(" ", n.level)
local s = space .. n.name
for i,v in ipairs(n.nodes) do
s = s .. " nodes[" .. i .. "]=" .. v.name
end
for k,v in pairs(n.attributes) do
s = s .. " " .. k .. "=[" .. v .. "]"
s = s .. " " .. k .. "=[[" .. v .. "]]"
end
print(s)
for i,v in ipairs(n.nodes) do
@ -26,11 +23,11 @@ p(root)
local function select( s )
print ""
print("->", s)
local tags = root:select(s)
for i,t in ipairs(tags.nodes) do
print(t.name)
local sel = root:select(s)
for element in pairs(sel) do
print(element.name)
end
print(# tags.nodes)
print(sel:len())
end
select("*")
select("link")
@ -51,10 +48,22 @@ select("ul > *")
select("body [class]")
select("body > [class]")
select(".contacts span:not(.firstname)")
select(":not(a)[href]")
select("[itemscope]:not([itemprop])")
select("link[rel='alternate']")
select("[test2=\"val='2'\"]")
select("[test5='val5']")
select("[test6='val\"\"6']")
select("[itemscope='']")
select("[itemscope=]")
select("[itemscope]")
print("\nchapters")
local sel, chapters = root("ol.chapters > li"), {}
for _,v in ipairs(sel.nodes) do
table.insert(chapters, v:getcontent())
for e in pairs(sel) do
table.insert(chapters, e:getcontent())
end
-- print
for i,v in ipairs(chapters) do
@ -62,11 +71,11 @@ for i,v in ipairs(chapters) do
end
print("\ncontacts")
local sel, contacts = root("ul.contacts > li")("span[class]"), {}
for _,v in ipairs(sel.nodes) do
local id = v.parent.parent.id -- li > a > span
local sel, contacts = root("ul.contacts span[class]"), {}
for e in pairs(sel) do
local id = e.parent.parent.id -- li > a > span
contacts[id] = contacts[id] or {}
contacts[id][v.classes[1]] = v:getcontent()
contacts[id][e.classes[1]] = e:getcontent()
end
-- print
for k,v in pairs(contacts) do
@ -78,7 +87,7 @@ end
print("\nmicrodata")
local sel, scopes = root("[itemprop]"), {}
for _,prop in ipairs(sel.nodes) do
for prop in pairs(sel) do
if prop.attributes["itemscope"] then goto nextprop end
local descendantscopes, scope = {}, prop
while true do