From 48183bbf04911f9e8890c469285467cd061e72e0 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Fri, 22 Mar 2013 22:34:50 +0100 Subject: [PATCH 1/5] :not() seems to be functioning now needs some tidying up and some more test runs --- ElementNode.lua | 20 +++++++++++++++----- test.lua | 10 ++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/ElementNode.lua b/ElementNode.lua index fdeebc4..fa08ad6 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -103,12 +103,22 @@ local function select(self, s) resultset = resultset + init end if part == "*" then goto nextpart end - for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do - if t == "" then resultset = resultset * self.deeperelements[w] - elseif t == "[" then resultset = resultset * self.deeperattributes[w] - elseif t == "#" then resultset = resultset * self.deeperids[w] - elseif t == "." then resultset = resultset * self.deeperclasses[w] + local match, filter + for t, w in string.gmatch(part, "([:%[#.]?)([^:%(%[#.%]%)]+)%]?%)?") do + -- TODO tidy up + if t == ":" then filter = w goto nextw end + if t == "" then match = self.deeperelements[w] + elseif t == "[" then match = self.deeperattributes[w] + elseif t == "#" then match = self.deeperids[w] + elseif t == "." 
then match = self.deeperclasses[w] end + if filter == "not" then + resultset = resultset - match + else + resultset = resultset * match + end + filter = nil + ::nextw:: end subjects = Set:new(resultset) ::nextpart:: diff --git a/test.lua b/test.lua index 0d42f4a..318b34a 100644 --- a/test.lua +++ b/test.lua @@ -113,4 +113,14 @@ local function printscope(node, table, level) end for node,table in pairs(scopes) do printscope(node, table) +end + +local sel = root("[itemscope]:not([itemprop])") +for i,v in ipairs(sel.nodes) do + print(v.name) +end + +local sel = root("[href]:not(a)") +for i,v in ipairs(sel.nodes) do + print(v.name) end \ No newline at end of file From 591e7ebc8b6a8c728eb6552cb7f510ef844439bf Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Sun, 24 Mar 2013 23:20:03 +0100 Subject: [PATCH 2/5] improved not and lost method chaining fixed :not() in that it filters after all matches, preventing later selection of elements that shouldn't have been there Also, ditched the idea of returning a container node, since it was complex and didn't add much. The functionality could be reintroduced by having Set implement the __call or maybe even __index to return the combined results of all its elements. 
--- ElementNode.lua | 55 ++++++++++++++++--------------------------------- test.lua | 42 ++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 54 deletions(-) diff --git a/ElementNode.lua b/ElementNode.lua index fa08ad6..73ebb56 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend) deepernodes = Set:new(), deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} } - if nameortext == "container" then - instance.root = node - elseif not node then + if not node then instance.name = "root" instance.root = instance instance._text = nameortext @@ -90,58 +88,41 @@ function ElementNode:close(closestart, closeend) end local function select(self, s) - if not s or type(s) ~= "string" then return {} end - local subjects = Set:new({self}) - local resultset - local childrenonly + if not s or type(s) ~= "string" then return Set:new() end + local subjects, resultset, childrenonly = Set:new({self}) + local sets = { + [""] = self.deeperelements, + ["["] = self.deeperattributes, + ["#"] = self.deeperids, + ["."] = self.deeperclasses + } for part in string.gmatch(s, "%S+") do if part == ">" then childrenonly = true goto nextpart end resultset = Set:new() for subject in pairs(subjects) do - local init = subject.deepernodes - if childrenonly then init = Set:new(subject.nodes) childrenonly = false end - resultset = resultset + init + local star = subject.deepernodes + if childrenonly then star = Set:new(subject.nodes) end + childrenonly = false + resultset = resultset + star end if part == "*" then goto nextpart end - local match, filter + local excludes, filter = Set:new() for t, w in string.gmatch(part, "([:%[#.]?)([^:%(%[#.%]%)]+)%]?%)?") do - -- TODO tidy up if t == ":" then filter = w goto nextw end - if t == "" then match = self.deeperelements[w] - elseif t == "[" then match = self.deeperattributes[w] - elseif t == "#" then match = self.deeperids[w] - 
elseif t == "." then match = self.deeperclasses[w] - end + local match = sets[t][w] if filter == "not" then - resultset = resultset - match + excludes = excludes + match else resultset = resultset * match end filter = nil ::nextw:: end + resultset = resultset - excludes subjects = Set:new(resultset) ::nextpart:: end - -- construct a container node for the resultset, so that we can :select() on it - local ret = ElementNode:new("container", self) - for node in pairs(resultset) do - table.insert(ret.nodes, node) - ret.deepernodes = ret.deepernodes + node.deepernodes - for listname,list in pairs({ - deeperelements = node.deeperelements, - deeperattributes = node.deeperattributes, - deeperids = node.deeperids, - deeperclasses = node.deeperclasses - }) do - local target = ret[listname] - for k,set in pairs(list) do - -- Set.__add will create an empty Set if not target[k] - target[k] = target[k] + set - end - end - end - return ret + return resultset end function ElementNode:select(s) return select(self, s) end diff --git a/test.lua b/test.lua index 318b34a..5417bd1 100644 --- a/test.lua +++ b/test.lua @@ -26,11 +26,11 @@ p(root) local function select( s ) print "" print("->", s) - local tags = root:select(s) - for i,t in ipairs(tags.nodes) do - print(t.name) + local sel = root:select(s) + for element in pairs(sel) do + print(element.name) end - print(# tags.nodes) + print(sel:len()) end select("*") select("link") @@ -53,8 +53,8 @@ select("body > [class]") print("\nchapters") local sel, chapters = root("ol.chapters > li"), {} -for _,v in ipairs(sel.nodes) do - table.insert(chapters, v:getcontent()) +for e in pairs(sel) do + table.insert(chapters, e:getcontent()) end -- print for i,v in ipairs(chapters) do @@ -62,11 +62,11 @@ for i,v in ipairs(chapters) do end print("\ncontacts") -local sel, contacts = root("ul.contacts > li")("span[class]"), {} -for _,v in ipairs(sel.nodes) do - local id = v.parent.parent.id -- li > a > span +local sel, contacts = root("ul.contacts 
span[class]"), {} +for e in pairs(sel) do + local id = e.parent.parent.id -- li > a > span contacts[id] = contacts[id] or {} - contacts[id][v.classes[1]] = v:getcontent() + contacts[id][e.classes[1]] = e:getcontent() end -- print for k,v in pairs(contacts) do @@ -78,7 +78,7 @@ end print("\nmicrodata") local sel, scopes = root("[itemprop]"), {} -for _,prop in ipairs(sel.nodes) do +for prop in pairs(sel) do if prop.attributes["itemscope"] then goto nextprop end local descendantscopes, scope = {}, prop while true do @@ -115,12 +115,20 @@ for node,table in pairs(scopes) do printscope(node, table) end -local sel = root("[itemscope]:not([itemprop])") -for i,v in ipairs(sel.nodes) do - print(v.name) +print("\nnot firstname") +local sel = root(".contacts span:not(.firstname)") +for e in pairs(sel) do + print(e.classes[1], e:getcontent()) end -local sel = root("[href]:not(a)") -for i,v in ipairs(sel.nodes) do - print(v.name) +print("\nnot a hrefs") +local sel = root(":not(a)[href]") +for e in pairs(sel) do + print(e.name, e.attributes["href"]) +end + +print("\ntop itemscopes") +local sel = root("[itemscope]:not([itemprop])") +for e in pairs(sel) do + print(e.name, e.attributes["itemtype"]) end \ No newline at end of file From 77f24f93be6f9f36ed24c637246815ee75766aa1 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Sun, 24 Mar 2013 23:30:34 +0100 Subject: [PATCH 3/5] the selector patterns explained --- ElementNode.lua | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ElementNode.lua b/ElementNode.lua index 73ebb56..650cd6a 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -107,7 +107,11 @@ local function select(self, s) end if part == "*" then goto nextpart end local excludes, filter = Set:new() - for t, w in string.gmatch(part, "([:%[#.]?)([^:%(%[#.%]%)]+)%]?%)?") do + for t, w in string.gmatch(part, + "([:%[#.]?)" .. -- t = an optional :, [, #, or . + "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) + "%]?%)?" 
-- followed by an uncaptured optional ] and/or ) + ) do if t == ":" then filter = w goto nextw end local match = sets[t][w] if filter == "not" then From de746865bebec33837b30c6a591735121d4450ec Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Mon, 25 Mar 2013 13:02:41 +0100 Subject: [PATCH 4/5] [attribute="value"] working --- ElementNode.lua | 17 ++++++++++++++--- test.lua | 25 +++++++------------------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ElementNode.lua b/ElementNode.lua index 650cd6a..ce65dfb 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -88,14 +88,14 @@ function ElementNode:close(closestart, closeend) end local function select(self, s) - if not s or type(s) ~= "string" then return Set:new() end - local subjects, resultset, childrenonly = Set:new({self}) + if not s or type(s) ~= "string" or s == "" then return Set:new() end local sets = { [""] = self.deeperelements, ["["] = self.deeperattributes, ["#"] = self.deeperids, ["."] = self.deeperclasses } + local subjects, resultset, childrenonly = Set:new({self}) for part in string.gmatch(s, "%S+") do if part == ">" then childrenonly = true goto nextpart end resultset = Set:new() @@ -107,13 +107,24 @@ local function select(self, s) end if part == "*" then goto nextpart end local excludes, filter = Set:new() - for t, w in string.gmatch(part, + for t, w, v in string.gmatch(part, "([:%[#.]?)" .. -- t = an optional :, [, #, or . "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) "%]?%)?" 
-- followed by an uncaptured optional ] and/or ) ) do if t == ":" then filter = w goto nextw end + if t == "[" then + w, v = string.match(w, "([^=]+)=?(%S*)") + end local match = sets[t][w] + if v and v ~= "" then + v = string.sub(v, 2, #v - 1) -- strip quotes + for node in pairs(match) do + if node.attributes[w] ~= v then + match:remove(node) + end + end + end if filter == "not" then excludes = excludes + match else diff --git a/test.lua b/test.lua index 5417bd1..d4be9b2 100644 --- a/test.lua +++ b/test.lua @@ -51,6 +51,13 @@ select("ul > *") select("body [class]") select("body > [class]") +select(".contacts span:not(.firstname)") +select(":not(a)[href]") +select("[itemscope]:not([itemprop])") + +select("link[rel='alternate']") +select("[test2=\"val='2'\"]") + print("\nchapters") local sel, chapters = root("ol.chapters > li"), {} for e in pairs(sel) do @@ -114,21 +121,3 @@ end for node,table in pairs(scopes) do printscope(node, table) end - -print("\nnot firstname") -local sel = root(".contacts span:not(.firstname)") -for e in pairs(sel) do - print(e.classes[1], e:getcontent()) -end - -print("\nnot a hrefs") -local sel = root(":not(a)[href]") -for e in pairs(sel) do - print(e.name, e.attributes["href"]) -end - -print("\ntop itemscopes") -local sel = root("[itemscope]:not([itemprop])") -for e in pairs(sel) do - print(e.name, e.attributes["itemtype"]) -end \ No newline at end of file From 64f3eb4df328bf45d61a3a117e5bc9bfad2a386d Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Mon, 25 Mar 2013 13:55:35 +0100 Subject: [PATCH 5/5] Moved some details to a function ... regarding [attribute=value] and other matching. Also explained (in comments) the different matching patterns. And fixed a bug where /> would be listed as an attribute. And added a few more tests. 
--- ElementNode.lua | 53 +++++++++++++++++++++++++++++-------------------- htmlparser.lua | 17 ++++++++++++---- test.html | 2 +- test.lua | 10 ++++++---- 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/ElementNode.lua b/ElementNode.lua index ce65dfb..4b530ea 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -89,12 +89,34 @@ end local function select(self, s) if not s or type(s) ~= "string" or s == "" then return Set:new() end - local sets = { - [""] = self.deeperelements, - ["["] = self.deeperattributes, - ["#"] = self.deeperids, - ["."] = self.deeperclasses - } + + local function match(t, w) + local sets = { + [""] = self.deeperelements, + ["["] = self.deeperattributes, + ["#"] = self.deeperids, + ["."] = self.deeperclasses + } + local v + if t == "[" then + w, v = string.match(w, + "([^=]+)" .. -- w = 1 or more characters up to a possible "=" + "=?" .. -- an optional uncaptured "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = sets[t][w] + if v and v ~= "" then + v = string.sub(v, 2, #v - 1) -- strip quotes + for node in pairs(matched) do + if node.attributes[w] ~= v then + matched:remove(node) + end + end + end + return matched + end + local subjects, resultset, childrenonly = Set:new({self}) for part in string.gmatch(s, "%S+") do if part == ">" then childrenonly = true goto nextpart end @@ -107,28 +129,17 @@ local function select(self, s) end if part == "*" then goto nextpart end local excludes, filter = Set:new() - for t, w, v in string.gmatch(part, + for t, w in string.gmatch(part, "([:%[#.]?)" .. -- t = an optional :, [, #, or . "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) "%]?%)?" 
-- followed by an uncaptured optional ] and/or ) ) do if t == ":" then filter = w goto nextw end - if t == "[" then - w, v = string.match(w, "([^=]+)=?(%S*)") - end - local match = sets[t][w] - if v and v ~= "" then - v = string.sub(v, 2, #v - 1) -- strip quotes - for node in pairs(match) do - if node.attributes[w] ~= v then - match:remove(node) - end - end - end + local matched = match(t, w) if filter == "not" then - excludes = excludes + match + excludes = excludes + matched else - resultset = resultset * match + resultset = resultset * matched end filter = nil ::nextw:: diff --git a/htmlparser.lua b/htmlparser.lua index bcc4ad0..053baad 100644 --- a/htmlparser.lua +++ b/htmlparser.lua @@ -9,7 +9,11 @@ local function parse(text) local node, descend, tpos, opentags = root, true, 1, {} while true do local openstart, name - openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos) + openstart, tpos, name = string.find(root._text, + "<" .. -- an uncaptured starting "<" + "(%w+)" .. -- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) if not name then break end local tag = ElementNode:new(name, node, descend, openstart, tpos) node = tag @@ -17,12 +21,17 @@ local function parse(text) local tagst, apos = tag:gettext(), 1 while true do local start, k, eq, quote, v - start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) - if not k then break end + start, apos, k, eq, quote = string.find(tagst, + "%s+" .. -- some uncaptured space + "([^%s=]+)" .. -- k = an unspaced string up to an optional "=" + "(=?)" .. -- eq = the optional "=", else "" + "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" + apos) + if not k or k == "/>" then break end if eq == "" then v = "" else - local pattern = "=([^%s'\">]*)" + local pattern = "=([^%s>]*)" if quote ~= '' then pattern = quote .. "([^" .. quote .. "]*)" .. 
quote end diff --git a/test.html b/test.html index 3b5f88b..dbff550 100644 --- a/test.html +++ b/test.html @@ -1,5 +1,5 @@ - + diff --git a/test.lua b/test.lua index d4be9b2..b139fa5 100644 --- a/test.lua +++ b/test.lua @@ -10,11 +10,8 @@ local root = htmlparser.parse(text) local function p(n) local space = string.rep(" ", n.level) local s = space .. n.name - for i,v in ipairs(n.nodes) do - s = s .. " nodes[" .. i .. "]=" .. v.name - end for k,v in pairs(n.attributes) do - s = s .. " " .. k .. "=[" .. v .. "]" + s = s .. " " .. k .. "=[[" .. v .. "]]" end print(s) for i,v in ipairs(n.nodes) do @@ -57,6 +54,11 @@ select("[itemscope]:not([itemprop])") select("link[rel='alternate']") select("[test2=\"val='2'\"]") +select("[test5='val5']") +select("[test6='val\"\"6']") +select("[itemscope='']") +select("[itemscope=]") +select("[itemscope]") print("\nchapters") local sel, chapters = root("ol.chapters > li"), {}