mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
commit
63cbbee1ae
@ -87,7 +87,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
|
|||||||
- `.deeperclasses` as `.deeperelements`, but keyed on class name
|
- `.deeperclasses` as `.deeperelements`, but keyed on class name
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
- Attribute values in selector strings cannot contain any spaces, nor any of `#`, `[`, `]`, `:`, `(`, or `)`
|
- Attribute values in selector strings cannot contain any spaces
|
||||||
- The spaces before and after the `>` in a `parent > child` relation are mandatory
|
- The spaces before and after the `>` in a `parent > child` relation are mandatory
|
||||||
- `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped
|
- `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped
|
||||||
- Text nodes are not separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"`
|
- Text nodes are not separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"`
|
||||||
|
@ -149,34 +149,37 @@ local function select(self, s)
|
|||||||
childrenonly = false
|
childrenonly = false
|
||||||
if part == "*" then goto nextpart end
|
if part == "*" then goto nextpart end
|
||||||
local excludes, filter = Set:new()
|
local excludes, filter = Set:new()
|
||||||
local halfword = ""
|
local start, pos = 0, 0
|
||||||
for t, w, c in string.gmatch(part,
|
while true do
|
||||||
"([:%[#.]?)" .. -- t = an optional :, [, #, or .
|
local switch, type, name, eq, quote
|
||||||
"([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
|
start, pos, switch, type, name, eq, quote = string.find(part,
|
||||||
"(%]?)%)?" -- followed by an uncaptured optional ] and/or )
|
"(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
|
||||||
) do
|
"([:%[#.]?)" .. -- type = a possible :, [, #, or .
|
||||||
-- this if..elseif.. block will match the pattern like "[src='aaa.jpg']"
|
"(%w+)" .. -- name = 1 or more alfanumeric chars
|
||||||
if t == "[" and c ~= "]" then
|
"([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
|
||||||
halfword = t .. w
|
"(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
|
||||||
goto nextw
|
pos + 1
|
||||||
elseif c == "" and halfword ~= "" then
|
)
|
||||||
halfword = halfword .. t .. w
|
if not name then break end
|
||||||
goto nextw
|
if ":" == type then
|
||||||
elseif t ~= "[" and c == "]" then
|
filter = name
|
||||||
halfword = halfword .. t .. w .. c
|
goto nextname
|
||||||
t, w = "[", string.sub(halfword, 2, -2)
|
|
||||||
halfword = ""
|
|
||||||
end
|
end
|
||||||
|
if ")" == switch then
|
||||||
if t == ":" then filter = w goto nextw end
|
filter = nil
|
||||||
local matched = match(t, w)
|
end
|
||||||
|
if "[" == type and "" ~= quote then
|
||||||
|
local value
|
||||||
|
start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
|
||||||
|
name = name .. eq .. value
|
||||||
|
end
|
||||||
|
local matched = match(type, name)
|
||||||
if filter == "not" then
|
if filter == "not" then
|
||||||
excludes = excludes + matched
|
excludes = excludes + matched
|
||||||
else
|
else
|
||||||
resultset = resultset * matched
|
resultset = resultset * matched
|
||||||
end
|
end
|
||||||
filter = nil
|
::nextname::
|
||||||
::nextw::
|
|
||||||
end
|
end
|
||||||
resultset = resultset - excludes
|
resultset = resultset - excludes
|
||||||
subjects = Set:new(resultset)
|
subjects = Set:new(resultset)
|
||||||
|
@ -85,7 +85,7 @@ function test_attr_equal()
|
|||||||
<n a1 a2= a3='' a4=""
|
<n a1 a2= a3='' a4=""
|
||||||
a5='a"5"' a6="a'6'" a7='#.[]:()' a8='|*+-=?$^%&/'
|
a5='a"5"' a6="a'6'" a7='#.[]:()' a8='|*+-=?$^%&/'
|
||||||
a9=a9
|
a9=a9
|
||||||
a10 a11="a11.js.jpg"></n>
|
a10></n>
|
||||||
]])
|
]])
|
||||||
assert_equal(1, #tree.nodes, "top level")
|
assert_equal(1, #tree.nodes, "top level")
|
||||||
assert(tree("[a1='']")[1], "a1=''")
|
assert(tree("[a1='']")[1], "a1=''")
|
||||||
@ -94,16 +94,11 @@ function test_attr_equal()
|
|||||||
assert(tree("[a4='']")[1], "a4=''")
|
assert(tree("[a4='']")[1], "a4=''")
|
||||||
assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'")
|
assert(tree("[a5='a\"5\"']")[1], "a5='a\"5\"'")
|
||||||
assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"")
|
assert(tree("[a6=\"a'6'\"]")[1], "a6=\"a'6'\"")
|
||||||
-- not these characters
|
assert(tree("[a7='#.[]:()']")[1], "a7='#.[]:()'")
|
||||||
-- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command)
|
|
||||||
-- they can occur in the HTML, but not in a selector string
|
|
||||||
-- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'")
|
|
||||||
assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'")
|
assert(tree("[a8='|*+-=?$^%&/']")[1], "a8='|*+-=?$^%&/'")
|
||||||
assert(tree("[a9='a9']")[1], "a9='a9'")
|
assert(tree("[a9='a9']")[1], "a9='a9'")
|
||||||
assert(tree("[a10='']")[1], "a10=''")
|
assert(tree("[a10='']")[1], "a10=''")
|
||||||
assert(tree("[a10=]")[1], "a10=")
|
assert(tree("[a10=]")[1], "a10=")
|
||||||
-- An exception for a7. Sometimes we may select javascript or img nodes with attr selector [src="a.js"] or [src="a.jpg"]
|
|
||||||
assert(tree("[a11='a11.js.jpg']")[1], "a11=")
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function test_attr_notequal()
|
function test_attr_notequal()
|
||||||
|
Loading…
Reference in New Issue
Block a user