mirror of
				https://github.com/TangentFoxy/lua-htmlparser.git
				synced 2025-10-25 04:45:01 +00:00 
			
		
		
		
	improved :not() and dropped method chaining
Fixed :not() so that it filters after all matches have been collected, preventing later selection of elements that shouldn't have been there. Also, ditched the idea of returning a container node, since it was complex and didn't add much. The functionality could be reintroduced by having Set implement __call or maybe even __index to return the combined results of all its elements.
This commit is contained in:
		| @@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend) | ||||
|     deepernodes = Set:new(), | ||||
|     deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} | ||||
|   } | ||||
|   if nameortext == "container" then | ||||
|     instance.root = node | ||||
|   elseif not node then | ||||
|   if not node then | ||||
|     instance.name = "root" | ||||
|     instance.root = instance | ||||
|     instance._text = nameortext | ||||
| @@ -90,58 +88,41 @@ function ElementNode:close(closestart, closeend) | ||||
| end | ||||
|  | ||||
| local function select(self, s) | ||||
|   if not s or type(s) ~= "string" then return {} end | ||||
|   local subjects = Set:new({self}) | ||||
|   local resultset | ||||
|   local childrenonly | ||||
|   if not s or type(s) ~= "string" then return Set:new() end | ||||
|   local subjects, resultset, childrenonly = Set:new({self}) | ||||
|   local sets = { | ||||
|     [""]  = self.deeperelements, | ||||
|     ["["] = self.deeperattributes, | ||||
|     ["#"] = self.deeperids, | ||||
|     ["."] = self.deeperclasses | ||||
|   } | ||||
|   for part in string.gmatch(s, "%S+") do | ||||
|     if part == ">" then childrenonly = true goto nextpart end | ||||
|     resultset = Set:new() | ||||
|     for subject in pairs(subjects) do | ||||
|       local init = subject.deepernodes | ||||
|       if childrenonly then init = Set:new(subject.nodes) childrenonly = false end | ||||
|       resultset = resultset + init | ||||
|       local star = subject.deepernodes | ||||
|       if childrenonly then star = Set:new(subject.nodes) end | ||||
|       childrenonly = false | ||||
|       resultset = resultset + star | ||||
|     end | ||||
|     if part == "*" then goto nextpart end | ||||
|     local match, filter | ||||
|     local excludes, filter = Set:new() | ||||
|     for t, w in string.gmatch(part, "([:%[#.]?)([^:%(%[#.%]%)]+)%]?%)?") do | ||||
|       -- TODO tidy up | ||||
|       if t == ":" then filter = w goto nextw end | ||||
|       if t == "" then match = self.deeperelements[w] | ||||
|       elseif t == "[" then match = self.deeperattributes[w] | ||||
|       elseif t == "#" then match = self.deeperids[w] | ||||
|       elseif t == "." then match = self.deeperclasses[w] | ||||
|       end | ||||
|       local match = sets[t][w] | ||||
|       if filter == "not" then | ||||
|         resultset = resultset - match | ||||
|         excludes = excludes + match | ||||
|       else | ||||
|         resultset = resultset * match | ||||
|       end | ||||
|       filter = nil | ||||
|       ::nextw:: | ||||
|     end | ||||
|     resultset = resultset - excludes | ||||
|     subjects = Set:new(resultset) | ||||
|     ::nextpart:: | ||||
|   end | ||||
|   -- construct a container node for the resultset, so that we can :select() on it | ||||
|   local ret = ElementNode:new("container", self) | ||||
|   for node in pairs(resultset) do | ||||
|     table.insert(ret.nodes, node) | ||||
|     ret.deepernodes = ret.deepernodes + node.deepernodes | ||||
|     for listname,list in pairs({ | ||||
|       deeperelements = node.deeperelements, | ||||
|       deeperattributes = node.deeperattributes, | ||||
|       deeperids = node.deeperids, | ||||
|       deeperclasses = node.deeperclasses | ||||
|     }) do | ||||
|       local target = ret[listname] | ||||
|       for k,set in pairs(list) do | ||||
|         -- Set.__add will create an empty Set if not target[k] | ||||
|         target[k] = target[k] + set | ||||
|       end | ||||
|     end | ||||
|   end | ||||
|   return ret | ||||
|   return resultset | ||||
| end | ||||
|  | ||||
| function ElementNode:select(s) return select(self, s) end | ||||
|   | ||||
							
								
								
									
										42
									
								
								test.lua
									
									
									
									
									
								
							
							
						
						
									
										42
									
								
								test.lua
									
									
									
									
									
								
							| @@ -26,11 +26,11 @@ p(root) | ||||
| local function select( s ) | ||||
|   print "" | ||||
|   print("->", s) | ||||
|   local tags = root:select(s) | ||||
|   for i,t in ipairs(tags.nodes) do | ||||
|     print(t.name) | ||||
|   local sel = root:select(s) | ||||
|   for element in pairs(sel) do | ||||
|     print(element.name) | ||||
|   end | ||||
|   print(# tags.nodes) | ||||
|   print(sel:len()) | ||||
| end | ||||
| select("*") | ||||
| select("link") | ||||
| @@ -53,8 +53,8 @@ select("body > [class]") | ||||
|  | ||||
| print("\nchapters") | ||||
| local sel, chapters = root("ol.chapters > li"), {} | ||||
| for _,v in ipairs(sel.nodes) do | ||||
|   table.insert(chapters, v:getcontent()) | ||||
| for e in pairs(sel) do | ||||
|   table.insert(chapters, e:getcontent()) | ||||
| end | ||||
| -- print | ||||
| for i,v in ipairs(chapters) do | ||||
| @@ -62,11 +62,11 @@ for i,v in ipairs(chapters) do | ||||
| end | ||||
|  | ||||
| print("\ncontacts") | ||||
| local sel, contacts = root("ul.contacts > li")("span[class]"), {} | ||||
| for _,v in ipairs(sel.nodes) do | ||||
|   local id = v.parent.parent.id -- li > a > span | ||||
| local sel, contacts = root("ul.contacts span[class]"), {} | ||||
| for e in pairs(sel) do | ||||
|   local id = e.parent.parent.id -- li > a > span | ||||
|   contacts[id] = contacts[id] or {} | ||||
|   contacts[id][v.classes[1]] = v:getcontent() | ||||
|   contacts[id][e.classes[1]] = e:getcontent() | ||||
| end | ||||
| -- print | ||||
| for k,v in pairs(contacts) do | ||||
| @@ -78,7 +78,7 @@ end | ||||
|  | ||||
| print("\nmicrodata") | ||||
| local sel, scopes = root("[itemprop]"), {} | ||||
| for _,prop in ipairs(sel.nodes) do | ||||
| for prop in pairs(sel) do | ||||
|   if prop.attributes["itemscope"] then goto nextprop end | ||||
|   local descendantscopes, scope = {}, prop | ||||
|   while true do | ||||
| @@ -115,12 +115,20 @@ for node,table in pairs(scopes) do | ||||
|   printscope(node, table) | ||||
| end | ||||
|  | ||||
| local sel = root("[itemscope]:not([itemprop])") | ||||
| for i,v in ipairs(sel.nodes) do | ||||
|   print(v.name) | ||||
| print("\nnot firstname") | ||||
| local sel = root(".contacts span:not(.firstname)") | ||||
| for e in pairs(sel) do | ||||
|   print(e.classes[1], e:getcontent()) | ||||
| end | ||||
|  | ||||
| local sel = root("[href]:not(a)") | ||||
| for i,v in ipairs(sel.nodes) do | ||||
|   print(v.name) | ||||
| print("\nnot a hrefs") | ||||
| local sel = root(":not(a)[href]") | ||||
| for e in pairs(sel) do | ||||
|   print(e.name, e.attributes["href"]) | ||||
| end | ||||
|  | ||||
| print("\ntop itemscopes") | ||||
| local sel = root("[itemscope]:not([itemprop])") | ||||
| for e in pairs(sel) do | ||||
|   print(e.name, e.attributes["itemtype"]) | ||||
| end | ||||
		Reference in New Issue
	
	Block a user