From 2bba6a7189d179523ce564e29b74166bf1efecdd Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Tue, 2 Apr 2013 12:59:44 +0200 Subject: [PATCH 1/5] added test for void elements --- tst/init.lua | 63 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/tst/init.lua b/tst/init.lua index fc80b37..7097331 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -8,38 +8,75 @@ module("html", lunitx.testcase, package.seeall) local htmlparser = require("htmlparser") local tree, sel +function test_void() + tree = htmlparser.parse([[ +

+
+
+
+
+

+
+
+
+
+ ]]) + assert_equal(5, #tree.nodes, "top level") + for _,n in ipairs(tree.nodes) do + if n.name == "p" then + assert_equal(4, #n.nodes, "deeper level") + else + assert_equal("br", n.name, "name") + assert_equal(0, #n.attributes, "attributes") + assert_equal("", n:getcontent(), "content") + end + end +end + function test_descendants() tree = htmlparser.parse([[ 1 - 1.1 - 1.2 - 1.2.1 + 1 + 2 + 3 + + 4 + 2 - 2.1 - 2.2 - 2.2.1 + 5 + 6 + 7 + + 8 + ]]) sel = tree("parent child") - assert_equal(6, sel:len(), 'parent child') + assert_equal(8, sel:len(), 'parent child') end function test_children() tree = htmlparser.parse([[ 1 - 1.1 - 1.2 - 1.2.1 + 1 + 2 + not + + not + 2 - 2.1 - 2.2 - 2.2.1 + 3 + 4 + not + + not + ]]) sel = tree("parent > child") From b3bbb56d9f54a6939a28300430f7616cb94c6d38 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Tue, 2 Apr 2013 13:06:32 +0200 Subject: [PATCH 2/5] tidbit in hierarchy tests --- tst/init.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tst/init.lua b/tst/init.lua index 7097331..8f1a53f 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -53,6 +53,9 @@ function test_descendants() 8 + + not + ]]) sel = tree("parent child") assert_equal(8, sel:len(), 'parent child') @@ -78,6 +81,9 @@ function test_children() not + + not + ]]) sel = tree("parent > child") assert_equal(4, sel:len(), 'parent > child') From 10a5faf1920577831485b6d2a8be8df43a792f63 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Wed, 3 Apr 2013 22:32:23 +0200 Subject: [PATCH 3/5] added test_attr (and fixed one or two edge cases) --- src/htmlparser.lua | 10 +++--- src/htmlparser/ElementNode.lua | 13 ++++---- tst/init.lua | 61 ++++++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/src/htmlparser.lua b/src/htmlparser.lua index 209332c..1f05ef9 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -23,21 +23,19 @@ local function parse(text) local start, k, eq, quote, v start, apos, k, eq, quote = 
string.find(tagst, "%s+" .. -- some uncaptured space - "([^%s=]+)" .. -- k = an unspaced string up to an optional "=" + "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" "(=?)" .. -- eq = the optional; "=", else "" "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" apos) if not k or k == "/>" or k == ">" then break end - if eq == "" then - v = "" - else + if eq == "=" then local pattern = "=([^%s>]*)" - if quote ~= '' then + if quote ~= "" then pattern = quote .. "([^" .. quote .. "]*)" .. quote end start, apos, v = string.find(tagst, pattern, apos) end - tag:addattribute(k, v) + tag:addattribute(k, v or "") end if voidelements[string.lower(tag.name)] then diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index d6c99e8..e58454c 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -51,9 +51,8 @@ function ElementNode:addattribute(k, v) self.attributes[k] = v if string.lower(k) == "id" then self.id = v - end -- class attribute contains "space-separated tokens", each of which we'd like quick access to - if string.lower(k) == "class" then + elseif string.lower(k) == "class" then for class in string.gmatch(v, "%S+") do table.insert(self.classes, class) end @@ -98,18 +97,20 @@ local function select(self, s) local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes, ["#"] = self.deeperids, ["."] = self.deeperclasses} local function match(t, w) - local m, v - if t == "[" then w, m, v = string.match(w, + local m, e, v + if t == "[" then w, m, e, v = string.match(w, "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^" "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "=" - "=?" .. -- an optional uncaptured "=" + "(=?)" .. 
-- e = the optional "=" "(.*)" -- v = anything following the "=", or else "" ) end local matched = Set:new(sets[t][w]) -- attribute value selectors - if v and v ~= "" then + if e == "=" then + if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted v = string.sub(v, 2, #v - 1) -- strip quotes + if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute for node in pairs(matched) do local a = node.attributes[w] -- equals diff --git a/tst/init.lua b/tst/init.lua index 8f1a53f..1349809 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -6,10 +6,9 @@ local lunitx = require("lunitx") module("html", lunitx.testcase, package.seeall) local htmlparser = require("htmlparser") -local tree, sel function test_void() - tree = htmlparser.parse([[ + local tree = htmlparser.parse([[



@@ -27,14 +26,62 @@ function test_void() assert_equal(4, #n.nodes, "deeper level") else assert_equal("br", n.name, "name") - assert_equal(0, #n.attributes, "attributes") assert_equal("", n:getcontent(), "content") + for _ in pairs(n.attributes) do + fail("should not have attributes") + end end end end +function test_attr() + local tree = htmlparser.parse([[ + + + + ]]) + assert_equal(3, #tree.nodes, "top level") + local n + for _,v in ipairs(tree.nodes) do + if v.name == "n" then n = v break end + end + assert(tree("[a1]")[n], "a1") + assert(tree("[a2]")[n], "a2") + assert(tree("[a3]")[n], "a3") + assert(tree("[a4]")[n], "a4") + assert(tree("[a5]")[n], "a5") + assert(tree("[a6]")[n], "a6") + assert(tree("[a7]")[n], "a7") + assert(tree("[a8]")[n], "a8") + assert(tree("[a1='']")[n], "a1=''") + assert(tree("[a2='']")[n], "a2=''") + assert(tree("[a3='']")[n], "a3=''") + assert(tree("[a4='']")[n], "a4=''") + assert(tree("[a5='a\"5\"']")[n], "a5='a\"5\"'") + assert(tree("[a6=\"a'6'\"]")[n], "a6=\"a'6'\"") + assert(tree("[a8='a=8']")[n], "a8='a=8'") + assert_equal(1, tree("[a10=]"):len(), "a10=") + assert_equal(1, tree("[a10='']"):len(), "a10=''") + assert_equal(2, tree("[a10!='enen']"):len(), "a10!='enen'") + assert_equal(2, tree("[a10!='']"):len(), "a10!=''") + assert_equal(3, tree("[a0!='']"):len(), "a0!=''") + assert_equal(0, tree("[a0='']"):len(), "a0=''") + assert_equal(2, tree("[a9|='en']"):len(), "a9|='en'") + assert_equal(3, tree("[a9^='en']"):len(), "a9^='en'") + assert_equal(1, tree("[a9$='en']"):len(), "a9$='en'") + assert_equal(1, tree("[a11~='two']"):len(), "a1~='two'") + assert_equal(2, tree("[a11~='three']"):len(), "a1~='three'") + assert_equal(1, tree("[a11~='four']"):len(), "a1~='four'") + assert_equal(1, tree("[a7*='7']"):len(), "a7*='7'") + assert_equal(1, tree("[a11*='f']"):len(), "a11*='f'") +end + function test_descendants() - tree = htmlparser.parse([[ + local tree = htmlparser.parse([[ 1 1 2 @@ -57,12 +104,12 @@ function test_descendants() not ]]) 
- sel = tree("parent child") + local sel = tree("parent child") assert_equal(8, sel:len(), 'parent child') end function test_children() - tree = htmlparser.parse([[ + local tree = htmlparser.parse([[ 1 1 2 @@ -85,6 +132,6 @@ function test_children() not ]]) - sel = tree("parent > child") + local sel = tree("parent > child") assert_equal(4, sel:len(), 'parent > child') end \ No newline at end of file From 2983056fa29c70ec9be113ab490b13d83be8006e Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Fri, 5 Apr 2013 19:58:39 +0200 Subject: [PATCH 4/5] Organised and extended attr tests --- README.md | 6 ++- tst/init.lua | 119 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index f235aa5..da0887e 100644 --- a/README.md +++ b/README.md @@ -68,10 +68,12 @@ Selectors can be combined; e.g. `".class:not([attribute]) element.class"` ###Limitations - Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector -- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory +- Consequently, for the `parent > child` relation, the spaces before and after the `>` are mandatory +- Attribute values in selectors currently also cannot contain any of `#`, `.`, `[`, `]`, `:`, `(`, or `)` - `line1
line2

` is plainly `"line1
line2"` -- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need an end tag +- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need (and mustn't have) an end tag +- The HTML text is not validated in any way; tag and attribute names and the nesting of different tags is completely arbitrary. The only HTML-specific part of the parser is that it knows which tags are void elements ##Examples See `./doc/samples.lua` diff --git a/tst/init.lua b/tst/init.lua index 1349809..e3b164e 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -27,9 +27,6 @@ function test_void() else assert_equal("br", n.name, "name") assert_equal("", n:getcontent(), "content") - for _ in pairs(n.attributes) do - fail("should not have attributes") - end end end end @@ -37,18 +34,12 @@ end function test_attr() local tree = htmlparser.parse([[ - - + a5='a"5"' a6="a'6'" a7='#.[] :()' a8='|*+-=?$^%&/' + a9=a9 + a10> ]]) - assert_equal(3, #tree.nodes, "top level") - local n - for _,v in ipairs(tree.nodes) do - if v.name == "n" then n = v break end - end + assert_equal(1, #tree.nodes, "top level") + local n = tree.nodes[1] assert(tree("[a1]")[n], "a1") assert(tree("[a2]")[n], "a2") assert(tree("[a3]")[n], "a3") @@ -57,27 +48,91 @@ function test_attr() assert(tree("[a6]")[n], "a6") assert(tree("[a7]")[n], "a7") assert(tree("[a8]")[n], "a8") + assert(tree("[a9]")[n], "a9") + assert(tree("[a10]")[n], "a10") +end + +function test_attr_equal() + local tree = htmlparser.parse([[ + + ]]) + assert_equal(1, #tree.nodes, "top level") 
+ local n = tree.nodes[1] assert(tree("[a1='']")[n], "a1=''") assert(tree("[a2='']")[n], "a2=''") assert(tree("[a3='']")[n], "a3=''") assert(tree("[a4='']")[n], "a4=''") assert(tree("[a5='a\"5\"']")[n], "a5='a\"5\"'") assert(tree("[a6=\"a'6'\"]")[n], "a6=\"a'6'\"") - assert(tree("[a8='a=8']")[n], "a8='a=8'") - assert_equal(1, tree("[a10=]"):len(), "a10=") - assert_equal(1, tree("[a10='']"):len(), "a10=''") - assert_equal(2, tree("[a10!='enen']"):len(), "a10!='enen'") - assert_equal(2, tree("[a10!='']"):len(), "a10!=''") - assert_equal(3, tree("[a0!='']"):len(), "a0!=''") - assert_equal(0, tree("[a0='']"):len(), "a0=''") - assert_equal(2, tree("[a9|='en']"):len(), "a9|='en'") - assert_equal(3, tree("[a9^='en']"):len(), "a9^='en'") - assert_equal(1, tree("[a9$='en']"):len(), "a9$='en'") - assert_equal(1, tree("[a11~='two']"):len(), "a1~='two'") - assert_equal(2, tree("[a11~='three']"):len(), "a1~='three'") - assert_equal(1, tree("[a11~='four']"):len(), "a1~='four'") - assert_equal(1, tree("[a7*='7']"):len(), "a7*='7'") - assert_equal(1, tree("[a11*='f']"):len(), "a11*='f'") + -- not these characters + -- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command) + -- they can occur in the HTML, but not in a selector string + -- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'") + assert(tree("[a8='|*+-=?$^%&/']")[n], "a8='|*+-=?$^%&/'") + assert(tree("[a9='a9']")[n], "a9='a9'") + assert(tree("[a10='']")[n], "a10=''") + assert(tree("[a10=]")[n], "a10=") +end + +function test_attr_notequal() + local tree = htmlparser.parse([[ + + + + + ]]) + assert_equal(4, #tree.nodes, "top level") + assert_equal(3, tree("[a1!='a1']"):len(), "a1!='a1'") + assert_equal(4, tree("[a1!='b1']"):len(), "a1!='b1'") + assert_equal(3, tree("[a1!='']"):len(), "a1!=''") + assert_equal(3, tree("[a1!=]"):len(), "a1!=") +end + +function test_attr_prefix_start_end() + local tree = htmlparser.parse([[ + + + + + + ]]) + assert_equal(5, 
#tree.nodes, "top level") + assert_equal(3, tree("[a1|='en']"):len(), "a1|='en'") + assert_equal(4, tree("[a1^='en']"):len(), "a1^='en'") + assert_equal(2, tree("[a1$='en']"):len(), "a1$='en'") +end + +function test_attr_word() + local tree = htmlparser.parse([[ + + + + + ]]) + assert_equal(4, #tree.nodes, "top level") + assert_equal(1, tree("[a1~='two']"):len(), "a1~='two'") + assert_equal(2, tree("[a1~='three']"):len(), "a1~='three'") + assert_equal(1, tree("[a1~='four']"):len(), "a1~='four'") +end + +function test_attr_contains() + local tree = htmlparser.parse([[ + + + + + + + ]]) + assert_equal(6, #tree.nodes, "top level") + assert_equal(2, tree("[a1*='one']"):len(), "a1*='one'") + assert_equal(2, tree("[a1*='t']"):len(), "a1*='t'") + assert_equal(1, tree("[a1*='f']"):len(), "a1*='f'") + assert_equal(5, tree("[a1*='']"):len(), "a1*=''") + assert_equal(5, tree("[a1*=]"):len(), "a1*=") end function test_descendants() @@ -104,8 +159,7 @@ function test_descendants() not ]]) - local sel = tree("parent child") - assert_equal(8, sel:len(), 'parent child') + assert_equal(8, tree("parent child"):len(), 'parent child') end function test_children() @@ -132,6 +186,5 @@ function test_children() not ]]) - local sel = tree("parent > child") - assert_equal(4, sel:len(), 'parent > child') + assert_equal(4, tree("parent > child"):len(), 'parent > child') end \ No newline at end of file From 7c7b4b2f42b3886316fb923bc96b53451945c767 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Fri, 5 Apr 2013 20:03:04 +0200 Subject: [PATCH 5/5] Removed tests from sample.lua --- README.md | 5 ++++- doc/sample.lua | 52 -------------------------------------------------- 2 files changed, 4 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index da0887e..808949d 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,10 @@ Selectors can be combined; e.g. 
`".class:not([attribute]) element.class"` - The HTML text is not validated in any way; tag and attribute names and the nesting of different tags is completely arbitrary. The only HTML-specific part of the parser is that it knows which tags are void elements ##Examples -See `./doc/samples.lua` +See `./doc/sample.lua` + +##Tests +See `./tst/init.lua` ##Element type All tree elements provide, apart from `:select` and `()`, the following accessors: diff --git a/doc/sample.lua b/doc/sample.lua index 5c85bdb..9431784 100644 --- a/doc/sample.lua +++ b/doc/sample.lua @@ -23,58 +23,6 @@ local function p(n) end p(root) -local function select( s ) - print "" - print("->", s) - local sel = root:select(s) - for element in pairs(sel) do - print(element.name) - end - print(sel:len()) -end - -select("*") -select("link") -select("#/contacts/4711") -select(".chapters") -select("[href]") -select("span.firstname") -select("ul[id]") - -select("#/contacts/4711") -select("#/contacts/4711 *") -select("#/contacts/4711 .lastname") -select("body li[id]") - -select("ul") -select("ul *") -select("ul > *") -select("body [class]") -select("body > [class]") - -select(".contacts span:not(.firstname)") -select(":not(a)[href]") -select("[itemscope]:not([itemprop])") - -select("link[rel='alternate']") -select("[test2=\"val='2'\"]") -select("[test5='val5']") -select("[test6='val\"\"6']") -select("[itemscope='']") -select("[itemscope=]") -select("[itemscope]") - -select("[itemscope][itemprop='address']") -select("[itemscope][itemprop!='address']") -select("[itemscope][itemprop!='adres']") -select("[itemscope][itemprop!='']") -select("[hreflang|='en']") -select("[itemprop*='address']") -select("[words~='two']") -select("[words~='three']") -select("[itemprop$='ion']") -select("[hreflang^='en']") - print("\nchapters") local sel, chapters = root("ol.chapters > li"), {} for e in pairs(sel) do