mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-27 12:44:22 +00:00
commit
7fc66a4ef9
11
README.md
11
README.md
@ -68,13 +68,18 @@ Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
|
|||||||
|
|
||||||
### Limitations
|
### Limitations
|
||||||
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
|
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
|
||||||
- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory
|
- Consequently, for the `parent > child` relation, the spaces before and after the `>` are mandatory
|
||||||
|
- Attribute values in selectors currently also cannot contain any of `#`, `.`, `[`, `]`, `:`, `(`, or `)`
|
||||||
- `<!` elements are not parsed, including doctype, comments, and CDATA
|
- `<!` elements are not parsed, including doctype, comments, and CDATA
|
||||||
- Textnodes are not separate entries in the tree, so the content of `<p>line1<br />line2</p>` is plainly `"line1<br />line2"`
|
- Textnodes are not separate entries in the tree, so the content of `<p>line1<br />line2</p>` is plainly `"line1<br />line2"`
|
||||||
- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need an end tag
|
- All start and end tags should be explicitly specified in the text to be parsed; omitted tags (as [permitted](http://www.w3.org/TR/html5/syntax.html#optional-tags) by the HTML spec) are NOT implied. Only the [void](http://www.w3.org/TR/html5/syntax.html#void-elements) elements naturally don't need (and mustn't have) an end tag
|
||||||
|
- The HTML text is not validated in any way; tag and attribute names and the nesting of different tags are completely arbitrary. The only HTML-specific part of the parser is that it knows which tags are void elements
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
See `./doc/samples.lua`
|
See `./doc/sample.lua`
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
See `./tst/init.lua`
|
||||||
|
|
||||||
## Element type
|
## Element type
|
||||||
All tree elements provide, apart from `:select` and `()`, the following accessors:
|
All tree elements provide, apart from `:select` and `()`, the following accessors:
|
||||||
|
@ -23,58 +23,6 @@ local function p(n)
|
|||||||
end
|
end
|
||||||
p(root)
|
p(root)
|
||||||
|
|
||||||
local function select( s )
|
|
||||||
print ""
|
|
||||||
print("->", s)
|
|
||||||
local sel = root:select(s)
|
|
||||||
for element in pairs(sel) do
|
|
||||||
print(element.name)
|
|
||||||
end
|
|
||||||
print(sel:len())
|
|
||||||
end
|
|
||||||
|
|
||||||
select("*")
|
|
||||||
select("link")
|
|
||||||
select("#/contacts/4711")
|
|
||||||
select(".chapters")
|
|
||||||
select("[href]")
|
|
||||||
select("span.firstname")
|
|
||||||
select("ul[id]")
|
|
||||||
|
|
||||||
select("#/contacts/4711")
|
|
||||||
select("#/contacts/4711 *")
|
|
||||||
select("#/contacts/4711 .lastname")
|
|
||||||
select("body li[id]")
|
|
||||||
|
|
||||||
select("ul")
|
|
||||||
select("ul *")
|
|
||||||
select("ul > *")
|
|
||||||
select("body [class]")
|
|
||||||
select("body > [class]")
|
|
||||||
|
|
||||||
select(".contacts span:not(.firstname)")
|
|
||||||
select(":not(a)[href]")
|
|
||||||
select("[itemscope]:not([itemprop])")
|
|
||||||
|
|
||||||
select("link[rel='alternate']")
|
|
||||||
select("[test2=\"val='2'\"]")
|
|
||||||
select("[test5='val5']")
|
|
||||||
select("[test6='val\"\"6']")
|
|
||||||
select("[itemscope='']")
|
|
||||||
select("[itemscope=]")
|
|
||||||
select("[itemscope]")
|
|
||||||
|
|
||||||
select("[itemscope][itemprop='address']")
|
|
||||||
select("[itemscope][itemprop!='address']")
|
|
||||||
select("[itemscope][itemprop!='adres']")
|
|
||||||
select("[itemscope][itemprop!='']")
|
|
||||||
select("[hreflang|='en']")
|
|
||||||
select("[itemprop*='address']")
|
|
||||||
select("[words~='two']")
|
|
||||||
select("[words~='three']")
|
|
||||||
select("[itemprop$='ion']")
|
|
||||||
select("[hreflang^='en']")
|
|
||||||
|
|
||||||
print("\nchapters")
|
print("\nchapters")
|
||||||
local sel, chapters = root("ol.chapters > li"), {}
|
local sel, chapters = root("ol.chapters > li"), {}
|
||||||
for e in pairs(sel) do
|
for e in pairs(sel) do
|
||||||
|
@ -23,21 +23,19 @@ local function parse(text)
|
|||||||
local start, k, eq, quote, v
|
local start, k, eq, quote, v
|
||||||
start, apos, k, eq, quote = string.find(tagst,
|
start, apos, k, eq, quote = string.find(tagst,
|
||||||
"%s+" .. -- some uncaptured space
|
"%s+" .. -- some uncaptured space
|
||||||
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
|
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
|
||||||
"(=?)" .. -- eq = the optional; "=", else ""
|
"(=?)" .. -- eq = the optional; "=", else ""
|
||||||
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
|
||||||
apos)
|
apos)
|
||||||
if not k or k == "/>" or k == ">" then break end
|
if not k or k == "/>" or k == ">" then break end
|
||||||
if eq == "" then
|
if eq == "=" then
|
||||||
v = ""
|
|
||||||
else
|
|
||||||
local pattern = "=([^%s>]*)"
|
local pattern = "=([^%s>]*)"
|
||||||
if quote ~= '' then
|
if quote ~= "" then
|
||||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||||
end
|
end
|
||||||
start, apos, v = string.find(tagst, pattern, apos)
|
start, apos, v = string.find(tagst, pattern, apos)
|
||||||
end
|
end
|
||||||
tag:addattribute(k, v)
|
tag:addattribute(k, v or "")
|
||||||
end
|
end
|
||||||
|
|
||||||
if voidelements[string.lower(tag.name)] then
|
if voidelements[string.lower(tag.name)] then
|
||||||
|
@ -51,9 +51,8 @@ function ElementNode:addattribute(k, v)
|
|||||||
self.attributes[k] = v
|
self.attributes[k] = v
|
||||||
if string.lower(k) == "id" then
|
if string.lower(k) == "id" then
|
||||||
self.id = v
|
self.id = v
|
||||||
end
|
|
||||||
-- class attribute contains "space-separated tokens", each of which we'd like quick access to
|
-- class attribute contains "space-separated tokens", each of which we'd like quick access to
|
||||||
if string.lower(k) == "class" then
|
elseif string.lower(k) == "class" then
|
||||||
for class in string.gmatch(v, "%S+") do
|
for class in string.gmatch(v, "%S+") do
|
||||||
table.insert(self.classes, class)
|
table.insert(self.classes, class)
|
||||||
end
|
end
|
||||||
@ -98,18 +97,20 @@ local function select(self, s)
|
|||||||
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
|
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
|
||||||
["#"] = self.deeperids, ["."] = self.deeperclasses}
|
["#"] = self.deeperids, ["."] = self.deeperclasses}
|
||||||
local function match(t, w)
|
local function match(t, w)
|
||||||
local m, v
|
local m, e, v
|
||||||
if t == "[" then w, m, v = string.match(w,
|
if t == "[" then w, m, e, v = string.match(w,
|
||||||
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
|
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
|
||||||
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
|
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
|
||||||
"=?" .. -- an optional uncaptured "="
|
"(=?)" .. -- e = the optional "="
|
||||||
"(.*)" -- v = anything following the "=", or else ""
|
"(.*)" -- v = anything following the "=", or else ""
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
local matched = Set:new(sets[t][w])
|
local matched = Set:new(sets[t][w])
|
||||||
-- attribute value selectors
|
-- attribute value selectors
|
||||||
if v and v ~= "" then
|
if e == "=" then
|
||||||
|
if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
|
||||||
v = string.sub(v, 2, #v - 1) -- strip quotes
|
v = string.sub(v, 2, #v - 1) -- strip quotes
|
||||||
|
if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
|
||||||
for node in pairs(matched) do
|
for node in pairs(matched) do
|
||||||
local a = node.attributes[w]
|
local a = node.attributes[w]
|
||||||
-- equals
|
-- equals
|
||||||
|
181
tst/init.lua
181
tst/init.lua
@ -6,42 +6,185 @@ local lunitx = require("lunitx")
|
|||||||
module("html", lunitx.testcase, package.seeall)
|
module("html", lunitx.testcase, package.seeall)
|
||||||
|
|
||||||
local htmlparser = require("htmlparser")
|
local htmlparser = require("htmlparser")
|
||||||
local tree, sel
|
|
||||||
|
function test_void()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<p>
|
||||||
|
<br>
|
||||||
|
<br/>
|
||||||
|
<br >
|
||||||
|
<br />
|
||||||
|
</p>
|
||||||
|
<br>
|
||||||
|
<br/>
|
||||||
|
<br >
|
||||||
|
<br />
|
||||||
|
]])
|
||||||
|
assert_equal(5, #tree.nodes, "top level")
|
||||||
|
for _,n in ipairs(tree.nodes) do
|
||||||
|
if n.name == "p" then
|
||||||
|
assert_equal(4, #n.nodes, "deeper level")
|
||||||
|
else
|
||||||
|
assert_equal("br", n.name, "name")
|
||||||
|
assert_equal("", n:getcontent(), "content")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1 a2= a3='' a4=""
|
||||||
|
a5='a"5"' a6="a'6'" a7='#.[] :()' a8='|*+-=?$^%&/'
|
||||||
|
a9=a9
|
||||||
|
a10></n>
|
||||||
|
]])
|
||||||
|
assert_equal(1, #tree.nodes, "top level")
|
||||||
|
local n = tree.nodes[1]
|
||||||
|
assert(tree("[a1]")[n], "a1")
|
||||||
|
assert(tree("[a2]")[n], "a2")
|
||||||
|
assert(tree("[a3]")[n], "a3")
|
||||||
|
assert(tree("[a4]")[n], "a4")
|
||||||
|
assert(tree("[a5]")[n], "a5")
|
||||||
|
assert(tree("[a6]")[n], "a6")
|
||||||
|
assert(tree("[a7]")[n], "a7")
|
||||||
|
assert(tree("[a8]")[n], "a8")
|
||||||
|
assert(tree("[a9]")[n], "a9")
|
||||||
|
assert(tree("[a10]")[n], "a10")
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr_equal()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1 a2= a3='' a4=""
|
||||||
|
a5='a"5"' a6="a'6'" a7='#.[] :()' a8='|*+-=?$^%&/'
|
||||||
|
a9=a9
|
||||||
|
a10></n>
|
||||||
|
]])
|
||||||
|
assert_equal(1, #tree.nodes, "top level")
|
||||||
|
local n = tree.nodes[1]
|
||||||
|
assert(tree("[a1='']")[n], "a1=''")
|
||||||
|
assert(tree("[a2='']")[n], "a2=''")
|
||||||
|
assert(tree("[a3='']")[n], "a3=''")
|
||||||
|
assert(tree("[a4='']")[n], "a4=''")
|
||||||
|
assert(tree("[a5='a\"5\"']")[n], "a5='a\"5\"'")
|
||||||
|
assert(tree("[a6=\"a'6'\"]")[n], "a6=\"a'6'\"")
|
||||||
|
-- not these characters
|
||||||
|
-- (because these have a special meaning as id, class, or attribute selector, hierarchy separator, or filter command)
|
||||||
|
-- they can occur in the HTML, but not in a selector string
|
||||||
|
-- assert(tree("[a7='#.[] :()']")[n], "a7='#.[] :()'")
|
||||||
|
assert(tree("[a8='|*+-=?$^%&/']")[n], "a8='|*+-=?$^%&/'")
|
||||||
|
assert(tree("[a9='a9']")[n], "a9='a9'")
|
||||||
|
assert(tree("[a10='']")[n], "a10=''")
|
||||||
|
assert(tree("[a10=]")[n], "a10=")
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr_notequal()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1="a1"></n>
|
||||||
|
<n a1="a2"></n>
|
||||||
|
<n a1></n>
|
||||||
|
<n></n>
|
||||||
|
]])
|
||||||
|
assert_equal(4, #tree.nodes, "top level")
|
||||||
|
assert_equal(3, tree("[a1!='a1']"):len(), "a1!='a1'")
|
||||||
|
assert_equal(4, tree("[a1!='b1']"):len(), "a1!='b1'")
|
||||||
|
assert_equal(3, tree("[a1!='']"):len(), "a1!=''")
|
||||||
|
assert_equal(3, tree("[a1!=]"):len(), "a1!=")
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr_prefix_start_end()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1="en-gb"></n>
|
||||||
|
<n a1="en-us"></n>
|
||||||
|
<n a1="en"></n>
|
||||||
|
<n a1="enen"></n>
|
||||||
|
<n></n>
|
||||||
|
]])
|
||||||
|
assert_equal(5, #tree.nodes, "top level")
|
||||||
|
assert_equal(3, tree("[a1|='en']"):len(), "a1|='en'")
|
||||||
|
assert_equal(4, tree("[a1^='en']"):len(), "a1^='en'")
|
||||||
|
assert_equal(2, tree("[a1$='en']"):len(), "a1$='en'")
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr_word()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1="one two three"></n>
|
||||||
|
<n a1="three four five"></n>
|
||||||
|
<n a1></n>
|
||||||
|
<n></n>
|
||||||
|
]])
|
||||||
|
assert_equal(4, #tree.nodes, "top level")
|
||||||
|
assert_equal(1, tree("[a1~='two']"):len(), "a1~='two'")
|
||||||
|
assert_equal(2, tree("[a1~='three']"):len(), "a1~='three'")
|
||||||
|
assert_equal(1, tree("[a1~='four']"):len(), "a1~='four'")
|
||||||
|
end
|
||||||
|
|
||||||
|
function test_attr_contains()
|
||||||
|
local tree = htmlparser.parse([[
|
||||||
|
<n a1="one"></n>
|
||||||
|
<n a1="one two three"></n>
|
||||||
|
<n a1="three four five"></n>
|
||||||
|
<n a1=""></n>
|
||||||
|
<n a1></n>
|
||||||
|
<n></n>
|
||||||
|
]])
|
||||||
|
assert_equal(6, #tree.nodes, "top level")
|
||||||
|
assert_equal(2, tree("[a1*='one']"):len(), "a1*='one'")
|
||||||
|
assert_equal(2, tree("[a1*='t']"):len(), "a1*='t'")
|
||||||
|
assert_equal(1, tree("[a1*='f']"):len(), "a1*='f'")
|
||||||
|
assert_equal(5, tree("[a1*='']"):len(), "a1*=''")
|
||||||
|
assert_equal(5, tree("[a1*=]"):len(), "a1*=")
|
||||||
|
end
|
||||||
|
|
||||||
function test_descendants()
|
function test_descendants()
|
||||||
tree = htmlparser.parse([[
|
local tree = htmlparser.parse([[
|
||||||
<parent>1
|
<parent>1
|
||||||
<child>1.1</child>
|
<child>1</child>
|
||||||
<child>1.2
|
<child>2
|
||||||
<child>1.2.1</child>
|
<child>3</child>
|
||||||
</child>
|
</child>
|
||||||
|
<arbitrary>
|
||||||
|
<child>4</child>
|
||||||
|
</arbitrary>
|
||||||
</parent>
|
</parent>
|
||||||
<parent>2
|
<parent>2
|
||||||
<child>2.1</child>
|
<child>5</child>
|
||||||
<child>2.2
|
<child>6
|
||||||
<child>2.2.1</child>
|
<child>7</child>
|
||||||
</child>
|
</child>
|
||||||
|
<arbitrary>
|
||||||
|
<child>8</child>
|
||||||
|
</arbitrary>
|
||||||
</parent>
|
</parent>
|
||||||
|
<arbitrary>
|
||||||
|
<child>not</child>
|
||||||
|
</arbitrary>
|
||||||
]])
|
]])
|
||||||
sel = tree("parent child")
|
assert_equal(8, tree("parent child"):len(), 'parent child')
|
||||||
assert_equal(6, sel:len(), 'parent child')
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function test_children()
|
function test_children()
|
||||||
tree = htmlparser.parse([[
|
local tree = htmlparser.parse([[
|
||||||
<parent>1
|
<parent>1
|
||||||
<child>1.1</child>
|
<child>1</child>
|
||||||
<child>1.2
|
<child>2
|
||||||
<child>1.2.1</child>
|
<child>not</child>
|
||||||
</child>
|
</child>
|
||||||
|
<arbitrary>
|
||||||
|
<child>not</child>
|
||||||
|
</arbitrary>
|
||||||
</parent>
|
</parent>
|
||||||
<parent>2
|
<parent>2
|
||||||
<child>2.1</child>
|
<child>3</child>
|
||||||
<child>2.2
|
<child>4
|
||||||
<child>2.2.1</child>
|
<child>not</child>
|
||||||
</child>
|
</child>
|
||||||
|
<arbitrary>
|
||||||
|
<child>not</child>
|
||||||
|
</arbitrary>
|
||||||
</parent>
|
</parent>
|
||||||
|
<arbitrary>
|
||||||
|
<child>not</child>
|
||||||
|
</arbitrary>
|
||||||
]])
|
]])
|
||||||
sel = tree("parent > child")
|
assert_equal(4, tree("parent > child"):len(), 'parent > child')
|
||||||
assert_equal(4, sel:len(), 'parent > child')
|
|
||||||
end
|
end
|
Loading…
Reference in New Issue
Block a user