diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index 22ee7c6..11a6c22 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -149,34 +149,37 @@ local function select(self, s) childrenonly = false if part == "*" then goto nextpart end local excludes, filter = Set:new() - local halfword = "" - for t, w, c in string.gmatch(part, - "([:%[#.]?)" .. -- t = an optional :, [, #, or . - "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) - "(%]?)%)?" -- followed by an uncaptured optional ] and/or ) - ) do - -- this if..elseif.. block will match the pattern like "[src='aaa.jpg']" - if t == "[" and c ~= "]" then - halfword = t .. w - goto nextw - elseif c == "" and halfword ~= "" then - halfword = halfword .. t .. w - goto nextw - elseif t ~= "[" and c == "]" then - halfword = halfword .. t .. w .. c - t, w = "[", string.sub(halfword, 2, -2) - halfword = "" + local start, pos = 0, 0 + while true do + local switch, type, name, eq, quote + start, pos, switch, type, name, eq, quote = string.find(part, + "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off + "([:%[#.]?)" .. -- type = a possible :, [, #, or . + "(%w+)" .. -- name = 1 or more alfanumeric chars + "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = + "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value + pos + 1 + ) + if not name then break end + if ":" == type then + filter = name + goto nextname end - - if t == ":" then filter = w goto nextw end - local matched = match(t, w) + if ")" == switch then + filter = nil + end + if "[" == type and "" ~= quote then + local value + start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) + name = name .. eq .. value + end + local matched = match(type, name) if filter == "not" then excludes = excludes + matched else resultset = resultset * matched end - filter = nil - ::nextw:: + ::nextname:: end resultset = resultset - excludes subjects = Set:new(resultset) diff --git a/tst/init.lua b/tst/init.lua index e6c17b7..afdf52a 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -83,9 +83,9 @@ end function test_attr_equal() local tree = htmlparser.parse([[ + a10> ]]) assert_equal(1, #tree.nodes, "top level") assert(tree("[a1='']")[1], "a1=''") @@ -94,16 +94,11 @@ function test_attr_equal() assert(tree("[a4='']")[1], "a4=''") assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'") assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"") - -- not these characters - -- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command) - -- they can occur in the HTML, but not in a selector string - -- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'") + assert(tree("[a7='#.[]:()']")[1], "a7='#.[]:()'") assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'") assert(tree("[a9='a9']")[1], "a9='a9'") assert(tree("[a10='']")[1], "a10=''") assert(tree("[a10=]")[1], "a10=") - -- An excepton for a7. Some times we may select javascript or img nodes with attr selector [src="a.js"] or [src="a.jpg"] - assert(tree("[a11='a11.js.jpg']")[1], "a11=") end function test_attr_notequal()