mirror of
				https://github.com/TangentFoxy/lua-htmlparser.git
				synced 2025-10-24 20:35:01 +00:00 
			
		
		
		
	| @@ -87,7 +87,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor | ||||
| - `.deeperclasses` as `.deeperelements`, but keyed on class name | ||||
|  | ||||
| ## Limitations | ||||
| - Attribute values in selector strings cannot contain any spaces, nor any of `#`, `[`, `]`, `:`, `(`, or `)` | ||||
| - Attribute values in selector strings cannot contain any spaces | ||||
| - The spaces before and after the `>` in a `parent > child` relation are mandatory  | ||||
| - `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped | ||||
| - Text nodes are not separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"` | ||||
|   | ||||
| @@ -149,34 +149,37 @@ local function select(self, s) | ||||
|     childrenonly = false | ||||
|     if part == "*" then goto nextpart end | ||||
|     local excludes, filter = Set:new() | ||||
|     local halfword = "" | ||||
|     for t, w, c in string.gmatch(part, | ||||
|       "([:%[#.]?)" ..        -- t = an optional :, [, #, or . | ||||
|       "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) | ||||
|       "(%]?)%)?"               -- c = an optional ], followed by an uncaptured optional ) | ||||
|     ) do | ||||
|       -- this if..elseif.. block will match the pattern like "[src='aaa.jpg']" | ||||
|       if t == "[" and c ~= "]" then | ||||
|         halfword = t .. w | ||||
|         goto nextw | ||||
|       elseif c == "" and halfword ~= "" then | ||||
|         halfword = halfword .. t .. w | ||||
|         goto nextw | ||||
|       elseif t ~= "[" and c == "]" then | ||||
|         halfword = halfword .. t .. w .. c | ||||
|         t, w = "[", string.sub(halfword, 2, -2) | ||||
|         halfword = "" | ||||
|     local start, pos = 0, 0 | ||||
|     while true do | ||||
|       local switch, type, name, eq, quote | ||||
|       start, pos, switch, type, name, eq, quote = string.find(part, | ||||
|         "(%(?%)?)" ..         -- switch = a possible ( or ) switching the filter on or off | ||||
|         "([:%[#.]?)" ..       -- type = a possible :, [, #, or . | ||||
|         "(%w+)" ..            -- name = 1 or more alphanumeric chars | ||||
|         "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = | ||||
|         "(['\"]?)",           -- quote = a ' or " delimiting a possible attribute value | ||||
|         pos + 1 | ||||
|       ) | ||||
|       if not name then break end | ||||
|       if ":" == type then | ||||
|         filter = name | ||||
|         goto nextname | ||||
|       end | ||||
|  | ||||
|       if t == ":" then filter = w goto nextw end | ||||
|       local matched = match(t, w) | ||||
|       if ")" == switch then | ||||
|         filter = nil | ||||
|       end | ||||
|       if "[" == type and "" ~= quote then | ||||
|         local value | ||||
|         start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) | ||||
|         name = name .. eq .. value | ||||
|       end | ||||
|       local matched = match(type, name) | ||||
|       if filter == "not" then | ||||
|         excludes = excludes + matched | ||||
|       else | ||||
|         resultset = resultset * matched | ||||
|       end | ||||
|       filter = nil | ||||
|       ::nextw:: | ||||
|       ::nextname:: | ||||
|     end | ||||
|     resultset = resultset - excludes | ||||
|     subjects = Set:new(resultset) | ||||
|   | ||||
							
								
								
									
										11
									
								
								tst/init.lua
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								tst/init.lua
									
									
									
									
									
								
							| @@ -83,9 +83,9 @@ end | ||||
| function test_attr_equal() | ||||
| 	local tree = htmlparser.parse([[ | ||||
| 		<n a1 a2= a3='' a4="" | ||||
| 			a5='a"5"' a6="a'6'" a7='#.[] :()' a8='|*+-=?$^%&/' | ||||
| 			a5='a"5"' a6="a'6'" a7='#.[]:()' a8='|*+-=?$^%&/' | ||||
| 			a9=a9 | ||||
| 		a10 a11="a11.js.jpg"></n> | ||||
| 		a10></n> | ||||
| 	]]) | ||||
| 	assert_equal(1, #tree.nodes, "top level") | ||||
| 	assert(tree("[a1='']")[1], "a1=''") | ||||
| @@ -94,16 +94,11 @@ function test_attr_equal() | ||||
| 	assert(tree("[a4='']")[1], "a4=''") | ||||
| 	assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'") | ||||
| 	assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"") | ||||
| 	-- not these characters | ||||
| 	-- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command) | ||||
| 	-- they can occur in the HTML, but not in a selector string | ||||
| 	-- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'") | ||||
| 	assert(tree("[a7='#.[]:()']")[1], "a7='#.[]:()'") | ||||
| 	assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'") | ||||
| 	assert(tree("[a9='a9']")[1], "a9='a9'") | ||||
| 	assert(tree("[a10='']")[1], "a10=''") | ||||
| 	assert(tree("[a10=]")[1], "a10=") | ||||
| 	-- An exception for a7. Sometimes we may select javascript or img nodes with an attribute selector such as [src="a.js"] or [src="a.jpg"] | ||||
| 	assert(tree("[a11='a11.js.jpg']")[1], "a11=") | ||||
| end | ||||
|  | ||||
| function test_attr_notequal() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user