Merge pull request #12 from wscherphof/#6-selectors

#6 selectors: :not() and [attribute="value"] working
This commit is contained in:
Wouter Scherphof
2013-03-25 06:01:16 -07:00
4 changed files with 90 additions and 55 deletions

View File

@@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend)
deepernodes = Set:new(), deepernodes = Set:new(),
deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
} }
if nameortext == "container" then if not node then
instance.root = node
elseif not node then
instance.name = "root" instance.name = "root"
instance.root = instance instance.root = instance
instance._text = nameortext instance._text = nameortext
@@ -90,48 +88,67 @@ function ElementNode:close(closestart, closeend)
end end
local function select(self, s) local function select(self, s)
if not s or type(s) ~= "string" then return {} end if not s or type(s) ~= "string" or s == "" then return Set:new() end
local subjects = Set:new({self})
local resultset local function match(t, w)
local childrenonly local sets = {
[""] = self.deeperelements,
["["] = self.deeperattributes,
["#"] = self.deeperids,
["."] = self.deeperclasses
}
local v
if t == "[" then
w, v = string.match(w,
"([^=]+)" .. -- w = 1 or more characters up to a possible "="
"=?" .. -- an optional uncaptured "="
"(.*)" -- v = anything following the "=", or else ""
)
end
local matched = sets[t][w]
if v and v ~= "" then
v = string.sub(v, 2, #v - 1) -- strip quotes
for node in pairs(matched) do
if node.attributes[w] ~= v then
matched:remove(node)
end
end
end
return matched
end
local subjects, resultset, childrenonly = Set:new({self})
for part in string.gmatch(s, "%S+") do for part in string.gmatch(s, "%S+") do
if part == ">" then childrenonly = true goto nextpart end if part == ">" then childrenonly = true goto nextpart end
resultset = Set:new() resultset = Set:new()
for subject in pairs(subjects) do for subject in pairs(subjects) do
local init = subject.deepernodes local star = subject.deepernodes
if childrenonly then init = Set:new(subject.nodes) childrenonly = false end if childrenonly then star = Set:new(subject.nodes) end
resultset = resultset + init childrenonly = false
resultset = resultset + star
end end
if part == "*" then goto nextpart end if part == "*" then goto nextpart end
for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do local excludes, filter = Set:new()
if t == "" then resultset = resultset * self.deeperelements[w] for t, w in string.gmatch(part,
elseif t == "[" then resultset = resultset * self.deeperattributes[w] "([:%[#.]?)" .. -- t = an optional :, [, #, or .
elseif t == "#" then resultset = resultset * self.deeperids[w] "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
elseif t == "." then resultset = resultset * self.deeperclasses[w] "%]?%)?" -- followed by an uncaptured optional ] and/or )
) do
if t == ":" then filter = w goto nextw end
local matched = match(t, w)
if filter == "not" then
excludes = excludes + matched
else
resultset = resultset * matched
end end
filter = nil
::nextw::
end end
resultset = resultset - excludes
subjects = Set:new(resultset) subjects = Set:new(resultset)
::nextpart:: ::nextpart::
end end
-- construct a container node for the resultset, so that we can :select() on it return resultset
local ret = ElementNode:new("container", self)
for node in pairs(resultset) do
table.insert(ret.nodes, node)
ret.deepernodes = ret.deepernodes + node.deepernodes
for listname,list in pairs({
deeperelements = node.deeperelements,
deeperattributes = node.deeperattributes,
deeperids = node.deeperids,
deeperclasses = node.deeperclasses
}) do
local target = ret[listname]
for k,set in pairs(list) do
-- Set.__add will create an empty Set if not target[k]
target[k] = target[k] + set
end
end
end
return ret
end end
function ElementNode:select(s) return select(self, s) end function ElementNode:select(s) return select(self, s) end

View File

@@ -9,7 +9,11 @@ local function parse(text)
local node, descend, tpos, opentags = root, true, 1, {} local node, descend, tpos, opentags = root, true, 1, {}
while true do while true do
local openstart, name local openstart, name
openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos) openstart, tpos, name = string.find(root._text,
"<" .. -- an uncaptured starting "<"
"(%w+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
if not name then break end if not name then break end
local tag = ElementNode:new(name, node, descend, openstart, tpos) local tag = ElementNode:new(name, node, descend, openstart, tpos)
node = tag node = tag
@@ -17,12 +21,17 @@ local function parse(text)
local tagst, apos = tag:gettext(), 1 local tagst, apos = tag:gettext(), 1
while true do while true do
local start, k, eq, quote, v local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) start, apos, k, eq, quote = string.find(tagst,
if not k then break end "%s+" .. -- some uncaptured space
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
"(=?)" .. -- eq = the optional "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" then break end
if eq == "" then if eq == "" then
v = "" v = ""
else else
local pattern = "=([^%s'\">]*)" local pattern = "=([^%s>]*)"
if quote ~= '' then if quote ~= '' then
pattern = quote .. "([^" .. quote .. "]*)" .. quote pattern = quote .. "([^" .. quote .. "]*)" .. quote
end end

View File

@@ -1,5 +1,5 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5> <html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5 test6=val""6>
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<link rel="stylesheet" href="test.css" /> <link rel="stylesheet" href="test.css" />

View File

@@ -10,11 +10,8 @@ local root = htmlparser.parse(text)
local function p(n) local function p(n)
local space = string.rep(" ", n.level) local space = string.rep(" ", n.level)
local s = space .. n.name local s = space .. n.name
for i,v in ipairs(n.nodes) do
s = s .. " nodes[" .. i .. "]=" .. v.name
end
for k,v in pairs(n.attributes) do for k,v in pairs(n.attributes) do
s = s .. " " .. k .. "=[" .. v .. "]" s = s .. " " .. k .. "=[[" .. v .. "]]"
end end
print(s) print(s)
for i,v in ipairs(n.nodes) do for i,v in ipairs(n.nodes) do
@@ -26,11 +23,11 @@ p(root)
local function select( s ) local function select( s )
print "" print ""
print("->", s) print("->", s)
local tags = root:select(s) local sel = root:select(s)
for i,t in ipairs(tags.nodes) do for element in pairs(sel) do
print(t.name) print(element.name)
end end
print(# tags.nodes) print(sel:len())
end end
select("*") select("*")
select("link") select("link")
@@ -51,10 +48,22 @@ select("ul > *")
select("body [class]") select("body [class]")
select("body > [class]") select("body > [class]")
select(".contacts span:not(.firstname)")
select(":not(a)[href]")
select("[itemscope]:not([itemprop])")
select("link[rel='alternate']")
select("[test2=\"val='2'\"]")
select("[test5='val5']")
select("[test6='val\"\"6']")
select("[itemscope='']")
select("[itemscope=]")
select("[itemscope]")
print("\nchapters") print("\nchapters")
local sel, chapters = root("ol.chapters > li"), {} local sel, chapters = root("ol.chapters > li"), {}
for _,v in ipairs(sel.nodes) do for e in pairs(sel) do
table.insert(chapters, v:getcontent()) table.insert(chapters, e:getcontent())
end end
-- print -- print
for i,v in ipairs(chapters) do for i,v in ipairs(chapters) do
@@ -62,11 +71,11 @@ for i,v in ipairs(chapters) do
end end
print("\ncontacts") print("\ncontacts")
local sel, contacts = root("ul.contacts > li")("span[class]"), {} local sel, contacts = root("ul.contacts span[class]"), {}
for _,v in ipairs(sel.nodes) do for e in pairs(sel) do
local id = v.parent.parent.id -- li > a > span local id = e.parent.parent.id -- li > a > span
contacts[id] = contacts[id] or {} contacts[id] = contacts[id] or {}
contacts[id][v.classes[1]] = v:getcontent() contacts[id][e.classes[1]] = e:getcontent()
end end
-- print -- print
for k,v in pairs(contacts) do for k,v in pairs(contacts) do
@@ -78,7 +87,7 @@ end
print("\nmicrodata") print("\nmicrodata")
local sel, scopes = root("[itemprop]"), {} local sel, scopes = root("[itemprop]"), {}
for _,prop in ipairs(sel.nodes) do for prop in pairs(sel) do
if prop.attributes["itemscope"] then goto nextprop end if prop.attributes["itemscope"] then goto nextprop end
local descendantscopes, scope = {}, prop local descendantscopes, scope = {}, prop
while true do while true do