From 7ea22d13f7ea65dc2986eff444628f028b7f7c0a Mon Sep 17 00:00:00 2001
From: Wouter Scherphof
Date: Fri, 10 Jan 2014 20:52:01 +0100
Subject: [PATCH 1/2] Allow any unspaced quoted attribute value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
No more forbidden characters, except the space, which is still
interpreted as the ancestor-descendant separator.
---
src/htmlparser/ElementNode.lua | 47 ++++++++++++++++++----------------
tst/init.lua | 11 +++-----
2 files changed, 28 insertions(+), 30 deletions(-)
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index 22ee7c6..11a6c22 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -149,34 +149,37 @@ local function select(self, s)
childrenonly = false
if part == "*" then goto nextpart end
local excludes, filter = Set:new()
- local halfword = ""
- for t, w, c in string.gmatch(part,
- "([:%[#.]?)" .. -- t = an optional :, [, #, or .
- "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
- "(%]?)%)?" -- followed by an uncaptured optional ] and/or )
- ) do
- -- this if..elseif.. block will match the pattern like "[src='aaa.jpg']"
- if t == "[" and c ~= "]" then
- halfword = t .. w
- goto nextw
- elseif c == "" and halfword ~= "" then
- halfword = halfword .. t .. w
- goto nextw
- elseif t ~= "[" and c == "]" then
- halfword = halfword .. t .. w .. c
- t, w = "[", string.sub(halfword, 2, -2)
- halfword = ""
+ local start, pos = 0, 0
+ while true do
+ local switch, type, name, eq, quote
+ start, pos, switch, type, name, eq, quote = string.find(part,
+ "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
+ "([:%[#.]?)" .. -- type = a possible :, [, #, or .
+ "(%w+)" .. -- name = 1 or more alphanumeric chars
+ "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
+ "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
+ pos + 1
+ )
+ if not name then break end
+ if ":" == type then
+ filter = name
+ goto nextname
end
-
- if t == ":" then filter = w goto nextw end
- local matched = match(t, w)
+ if ")" == switch then
+ filter = nil
+ end
+ if "[" == type and "" ~= quote then
+ local value
+ start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
+ name = name .. eq .. value
+ end
+ local matched = match(type, name)
if filter == "not" then
excludes = excludes + matched
else
resultset = resultset * matched
end
- filter = nil
- ::nextw::
+ ::nextname::
end
resultset = resultset - excludes
subjects = Set:new(resultset)
diff --git a/tst/init.lua b/tst/init.lua
index e6c17b7..afdf52a 100644
--- a/tst/init.lua
+++ b/tst/init.lua
@@ -83,9 +83,9 @@ end
function test_attr_equal()
local tree = htmlparser.parse([[
+ a10>
]])
assert_equal(1, #tree.nodes, "top level")
assert(tree("[a1='']")[1], "a1=''")
@@ -94,16 +94,11 @@ function test_attr_equal()
assert(tree("[a4='']")[1], "a4=''")
assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'")
assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"")
- -- not these characters
- -- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command)
- -- they can occur in the HTML, but not in a selector string
- -- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'")
+ assert(tree("[a7='#.[]:()']")[1], "a7='#.[]:()'")
assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'")
assert(tree("[a9='a9']")[1], "a9='a9'")
assert(tree("[a10='']")[1], "a10=''")
assert(tree("[a10=]")[1], "a10=")
- -- An excepton for a7. Some times we may select javascript or img nodes with attr selector [src="a.js"] or [src="a.jpg"]
- assert(tree("[a11='a11.js.jpg']")[1], "a11=")
end
function test_attr_notequal()
From 305f0346696ac0ae392a3ab256d7a894e93c45ef Mon Sep 17 00:00:00 2001
From: Wouter Scherphof
Date: Fri, 10 Jan 2014 20:55:10 +0100
Subject: [PATCH 2/2] Removed the dropped limitations
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index e4abbc7..633faa0 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
- `.deeperclasses` as `.deeperelements`, but keyed on class name
##Limitations
-- Attribute values in selector strings cannot contain any spaces, nor any of `#`, `[`, `]`, `:`, `(`, or `)`
+- Attribute values in selector strings cannot contain any spaces
- The spaces before and after the `>` in a `parent > child` relation are mandatory
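- `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped
- Textnodes are no separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"`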