diff --git a/README.md b/README.md
index f235aa5..808949d 100644
--- a/README.md
+++ b/README.md
@@ -68,13 +68,18 @@ Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
###Limitations
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
-- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory
+- Consequently, for the `parent > child` relation, the spaces before and after the `>` are mandatory
+- Attribute values in selectors currently also cannot contain any of `#`, `.`, `[`, `]`, `:`, `(`, or `)`
- `line1
line2
` is plainly `"line1
line2"`
-- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need an end tag
+- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need (and mustn't have) an end tag
+- The HTML text is not validated in any way; tag and attribute names and the nesting of different tags are completely arbitrary. The only HTML-specific part of the parser is that it knows which tags are void elements
##Examples
-See `./doc/samples.lua`
+See `./doc/sample.lua`
+
+##Tests
+See `./tst/init.lua`
##Element type
All tree elements provide, apart from `:select` and `()`, the following accessors:
diff --git a/doc/sample.lua b/doc/sample.lua
index 5c85bdb..9431784 100644
--- a/doc/sample.lua
+++ b/doc/sample.lua
@@ -23,58 +23,6 @@ local function p(n)
end
p(root)
-local function select( s )
- print ""
- print("->", s)
- local sel = root:select(s)
- for element in pairs(sel) do
- print(element.name)
- end
- print(sel:len())
-end
-
-select("*")
-select("link")
-select("#/contacts/4711")
-select(".chapters")
-select("[href]")
-select("span.firstname")
-select("ul[id]")
-
-select("#/contacts/4711")
-select("#/contacts/4711 *")
-select("#/contacts/4711 .lastname")
-select("body li[id]")
-
-select("ul")
-select("ul *")
-select("ul > *")
-select("body [class]")
-select("body > [class]")
-
-select(".contacts span:not(.firstname)")
-select(":not(a)[href]")
-select("[itemscope]:not([itemprop])")
-
-select("link[rel='alternate']")
-select("[test2=\"val='2'\"]")
-select("[test5='val5']")
-select("[test6='val\"\"6']")
-select("[itemscope='']")
-select("[itemscope=]")
-select("[itemscope]")
-
-select("[itemscope][itemprop='address']")
-select("[itemscope][itemprop!='address']")
-select("[itemscope][itemprop!='adres']")
-select("[itemscope][itemprop!='']")
-select("[hreflang|='en']")
-select("[itemprop*='address']")
-select("[words~='two']")
-select("[words~='three']")
-select("[itemprop$='ion']")
-select("[hreflang^='en']")
-
print("\nchapters")
local sel, chapters = root("ol.chapters > li"), {}
for e in pairs(sel) do
diff --git a/src/htmlparser.lua b/src/htmlparser.lua
index 209332c..1f05ef9 100644
--- a/src/htmlparser.lua
+++ b/src/htmlparser.lua
@@ -23,21 +23,19 @@ local function parse(text)
local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst,
"%s+" .. -- some uncaptured space
- "([^%s=]+)" .. -- k = an unspaced string up to an optional "="
+ "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
"(=?)" .. -- eq = the optional; "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" or k == ">" then break end
- if eq == "" then
- v = ""
- else
+ if eq == "=" then
local pattern = "=([^%s>]*)"
- if quote ~= '' then
+ if quote ~= "" then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end
start, apos, v = string.find(tagst, pattern, apos)
end
- tag:addattribute(k, v)
+ tag:addattribute(k, v or "")
end
if voidelements[string.lower(tag.name)] then
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index d6c99e8..e58454c 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -51,9 +51,8 @@ function ElementNode:addattribute(k, v)
self.attributes[k] = v
if string.lower(k) == "id" then
self.id = v
- end
-- class attribute contains "space-separated tokens", each of which we'd like quick access to
- if string.lower(k) == "class" then
+ elseif string.lower(k) == "class" then
for class in string.gmatch(v, "%S+") do
table.insert(self.classes, class)
end
@@ -98,18 +97,20 @@ local function select(self, s)
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
["#"] = self.deeperids, ["."] = self.deeperclasses}
local function match(t, w)
- local m, v
- if t == "[" then w, m, v = string.match(w,
+ local m, e, v
+ if t == "[" then w, m, e, v = string.match(w,
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
- "=?" .. -- an optional uncaptured "="
+ "(=?)" .. -- e = the optional "="
"(.*)" -- v = anything following the "=", or else ""
)
end
local matched = Set:new(sets[t][w])
-- attribute value selectors
- if v and v ~= "" then
+ if e == "=" then
+ if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
v = string.sub(v, 2, #v - 1) -- strip quotes
+ if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
for node in pairs(matched) do
local a = node.attributes[w]
-- equals
diff --git a/tst/init.lua b/tst/init.lua
index fc80b37..e3b164e 100644
--- a/tst/init.lua
+++ b/tst/init.lua
@@ -6,42 +6,185 @@ local lunitx = require("lunitx")
module("html", lunitx.testcase, package.seeall)
local htmlparser = require("htmlparser")
-local tree, sel
+
+function test_void()
+ local tree = htmlparser.parse([[
+
+
+
+
+
+
+
+
+
+
+ ]])
+ assert_equal(5, #tree.nodes, "top level")
+ for _,n in ipairs(tree.nodes) do
+ if n.name == "p" then
+ assert_equal(4, #n.nodes, "deeper level")
+ else
+ assert_equal("br", n.name, "name")
+ assert_equal("", n:getcontent(), "content")
+ end
+ end
+end
+
+function test_attr()
+ local tree = htmlparser.parse([[
+
+ ]])
+ assert_equal(1, #tree.nodes, "top level")
+ local n = tree.nodes[1]
+ assert(tree("[a1]")[n], "a1")
+ assert(tree("[a2]")[n], "a2")
+ assert(tree("[a3]")[n], "a3")
+ assert(tree("[a4]")[n], "a4")
+ assert(tree("[a5]")[n], "a5")
+ assert(tree("[a6]")[n], "a6")
+ assert(tree("[a7]")[n], "a7")
+ assert(tree("[a8]")[n], "a8")
+ assert(tree("[a9]")[n], "a9")
+ assert(tree("[a10]")[n], "a10")
+end
+
+function test_attr_equal()
+ local tree = htmlparser.parse([[
+
+ ]])
+ assert_equal(1, #tree.nodes, "top level")
+ local n = tree.nodes[1]
+ assert(tree("[a1='']")[n], "a1=''")
+ assert(tree("[a2='']")[n], "a2=''")
+ assert(tree("[a3='']")[n], "a3=''")
+ assert(tree("[a4='']")[n], "a4=''")
+ assert(tree("[a5='a\"5\"']")[n], "a5='a\"5\"'")
+ assert(tree("[a6=\"a'6'\"]")[n], "a6=\"a'6'\"")
+ -- not these characters
+ -- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command)
+ -- they can occur in the HTML, but not in a selector string
+ -- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'")
+ assert(tree("[a8='|*+-=?$^%&/']")[n], "a8='|*+-=?$^%&/'")
+ assert(tree("[a9='a9']")[n], "a9='a9'")
+ assert(tree("[a10='']")[n], "a10=''")
+ assert(tree("[a10=]")[n], "a10=")
+end
+
+function test_attr_notequal()
+ local tree = htmlparser.parse([[
+
+
+
+
+ ]])
+ assert_equal(4, #tree.nodes, "top level")
+ assert_equal(3, tree("[a1!='a1']"):len(), "a1!='a1'")
+ assert_equal(4, tree("[a1!='b1']"):len(), "a1!='b1'")
+ assert_equal(3, tree("[a1!='']"):len(), "a1!=''")
+ assert_equal(3, tree("[a1!=]"):len(), "a1!=")
+end
+
+function test_attr_prefix_start_end()
+ local tree = htmlparser.parse([[
+
+
+
+
+
+ ]])
+ assert_equal(5, #tree.nodes, "top level")
+ assert_equal(3, tree("[a1|='en']"):len(), "a1|='en'")
+ assert_equal(4, tree("[a1^='en']"):len(), "a1^='en'")
+ assert_equal(2, tree("[a1$='en']"):len(), "a1$='en'")
+end
+
+function test_attr_word()
+ local tree = htmlparser.parse([[
+
+
+
+
+ ]])
+ assert_equal(4, #tree.nodes, "top level")
+ assert_equal(1, tree("[a1~='two']"):len(), "a1~='two'")
+ assert_equal(2, tree("[a1~='three']"):len(), "a1~='three'")
+ assert_equal(1, tree("[a1~='four']"):len(), "a1~='four'")
+end
+
+function test_attr_contains()
+ local tree = htmlparser.parse([[
+
+
+
+
+
+
+ ]])
+ assert_equal(6, #tree.nodes, "top level")
+ assert_equal(2, tree("[a1*='one']"):len(), "a1*='one'")
+ assert_equal(2, tree("[a1*='t']"):len(), "a1*='t'")
+ assert_equal(1, tree("[a1*='f']"):len(), "a1*='f'")
+ assert_equal(5, tree("[a1*='']"):len(), "a1*=''")
+ assert_equal(5, tree("[a1*=]"):len(), "a1*=")
+end
function test_descendants()
- tree = htmlparser.parse([[
+ local tree = htmlparser.parse([[
1
- 1.1
- 1.2
- 1.2.1
+ 1
+ 2
+ 3
+
+ 4
+
2
- 2.1
- 2.2
- 2.2.1
+ 5
+ 6
+ 7
+
+ 8
+
+
+ not
+
]])
- sel = tree("parent child")
- assert_equal(6, sel:len(), 'parent child')
+ assert_equal(8, tree("parent child"):len(), 'parent child')
end
function test_children()
- tree = htmlparser.parse([[
+ local tree = htmlparser.parse([[
1
- 1.1
- 1.2
- 1.2.1
+ 1
+ 2
+ not
+
+ not
+
2
- 2.1
- 2.2
- 2.2.1
+ 3
+ 4
+ not
+
+ not
+
+
+ not
+
]])
- sel = tree("parent > child")
- assert_equal(4, sel:len(), 'parent > child')
+ assert_equal(4, tree("parent > child"):len(), 'parent > child')
end
\ No newline at end of file