Allow any unspaced quoted attribute value

No more forbidden characters, except the space, which is still
interpreted as the ancestor-descendant separator.
This commit is contained in:
Wouter Scherphof 2014-01-10 20:52:01 +01:00
parent 1dafa955d9
commit 7ea22d13f7
2 changed files with 28 additions and 30 deletions

View File

@ -149,34 +149,37 @@ local function select(self, s)
childrenonly = false childrenonly = false
if part == "*" then goto nextpart end if part == "*" then goto nextpart end
local excludes, filter = Set:new() local excludes, filter = Set:new()
local halfword = "" local start, pos = 0, 0
for t, w, c in string.gmatch(part, while true do
"([:%[#.]?)" .. -- t = an optional :, [, #, or . local switch, type, name, eq, quote
"([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) start, pos, switch, type, name, eq, quote = string.find(part,
"(%]?)%)?" -- followed by an uncaptured optional ] and/or ) "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
) do "([:%[#.]?)" .. -- type = a possible :, [, #, or .
-- this if..elseif.. block will match the pattern like "[src='aaa.jpg']" "(%w+)" .. -- name = 1 or more alfanumeric chars
if t == "[" and c ~= "]" then "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
halfword = t .. w "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
goto nextw pos + 1
elseif c == "" and halfword ~= "" then )
halfword = halfword .. t .. w if not name then break end
goto nextw if ":" == type then
elseif t ~= "[" and c == "]" then filter = name
halfword = halfword .. t .. w .. c goto nextname
t, w = "[", string.sub(halfword, 2, -2)
halfword = ""
end end
if ")" == switch then
if t == ":" then filter = w goto nextw end filter = nil
local matched = match(t, w) end
if "[" == type and "" ~= quote then
local value
start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
name = name .. eq .. value
end
local matched = match(type, name)
if filter == "not" then if filter == "not" then
excludes = excludes + matched excludes = excludes + matched
else else
resultset = resultset * matched resultset = resultset * matched
end end
filter = nil ::nextname::
::nextw::
end end
resultset = resultset - excludes resultset = resultset - excludes
subjects = Set:new(resultset) subjects = Set:new(resultset)

View File

@ -83,9 +83,9 @@ end
function test_attr_equal() function test_attr_equal()
local tree = htmlparser.parse([[ local tree = htmlparser.parse([[
<n a1 a2= a3='' a4="" <n a1 a2= a3='' a4=""
a5='a"5"' a6="a'6'" a7='#.[] :()' a8='|*+-=?$^%&/' a5='a"5"' a6="a'6'" a7='#.[]:()' a8='|*+-=?$^%&/'
a9=a9 a9=a9
a10 a11="a11.js.jpg"></n> a10></n>
]]) ]])
assert_equal(1, #tree.nodes, "top level") assert_equal(1, #tree.nodes, "top level")
assert(tree("[a1='']")[1], "a1=''") assert(tree("[a1='']")[1], "a1=''")
@ -94,16 +94,11 @@ function test_attr_equal()
assert(tree("[a4='']")[1], "a4=''") assert(tree("[a4='']")[1], "a4=''")
assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'") assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'")
assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"") assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"")
-- not these characters assert(tree("[a7='#.[]:()']")[1], "a7='#.[]:()'")
-- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command)
-- they can occur in the HTML, but not in a selector string
-- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'")
assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'") assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'")
assert(tree("[a9='a9']")[1], "a9='a9'") assert(tree("[a9='a9']")[1], "a9='a9'")
assert(tree("[a10='']")[1], "a10=''") assert(tree("[a10='']")[1], "a10=''")
assert(tree("[a10=]")[1], "a10=") assert(tree("[a10=]")[1], "a10=")
-- An excepton for a7. Some times we may select javascript or img nodes with attr selector [src="a.js"] or [src="a.jpg"]
assert(tree("[a11='a11.js.jpg']")[1], "a11=")
end end
function test_attr_notequal() function test_attr_notequal()