Moved some details to a function

... regarding [attribute=value] and other matching.
Also explained (in comments) the different matching patterns.
And fixed a bug where /> would be listed as an attribute.
And added a few more tests.
This commit is contained in:
Wouter Scherphof 2013-03-25 13:55:35 +01:00
parent de746865be
commit 64f3eb4df3
4 changed files with 52 additions and 30 deletions

View File

@ -89,12 +89,34 @@ end
local function select(self, s)
if not s or type(s) ~= "string" or s == "" then return Set:new() end
local sets = {
[""] = self.deeperelements,
["["] = self.deeperattributes,
["#"] = self.deeperids,
["."] = self.deeperclasses
}
-- Resolve one simple selector to the set of nodes it matches.
--   t: selector type prefix: "" (element name), "[" (attribute),
--      "#" (id) or "." (class)
--   w: the selector text following the prefix; for "[" this is either
--      "name" or "name=value", where value may be wrapped in quotes
-- Returns the entry the corresponding self.deeper* table holds for w when
-- no attribute value is given (this may be nil if there is no such entry),
-- or a new Set holding only the nodes whose attribute equals the value.
local function match(t, w)
    local sets = {
        [""] = self.deeperelements,
        ["["] = self.deeperattributes,
        ["#"] = self.deeperids,
        ["."] = self.deeperclasses
    }
    local v
    if t == "[" then
        w, v = string.match(w,
            "([^=]+)" .. -- w = 1 or more characters up to a possible "="
            "=?" .. -- an optional uncaptured "="
            "(.*)" -- v = anything following the "=", or else ""
        )
    end
    local matched = sets[t][w]
    -- Nothing to filter: no entry for w, or no (non-empty) value requested.
    if not matched or not v or v == "" then return matched end
    -- Strip the surrounding quotes only when they are really there and
    -- paired, so an unquoted value like [test5=val5] keeps all its characters.
    local first = string.sub(v, 1, 1)
    if #v >= 2 and (first == "'" or first == '"')
        and string.sub(v, -1) == first then
        v = string.sub(v, 2, -2)
    end
    -- Collect the survivors into a fresh Set instead of removing the others
    -- from `matched`: `matched` is the very object stored in the self.deeper*
    -- table, so removing from it would alter the result of later selects.
    local kept = {}
    for node in pairs(matched) do
        if node.attributes[w] == v then
            kept[#kept + 1] = node
        end
    end
    return Set:new(kept)
end
local subjects, resultset, childrenonly = Set:new({self})
for part in string.gmatch(s, "%S+") do
if part == ">" then childrenonly = true goto nextpart end
@ -107,28 +129,17 @@ local function select(self, s)
end
if part == "*" then goto nextpart end
local excludes, filter = Set:new()
for t, w, v in string.gmatch(part,
for t, w in string.gmatch(part,
"([:%[#.]?)" .. -- t = an optional :, [, #, or .
"([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
"%]?%)?" -- followed by an uncaptured optional ] and/or )
) do
if t == ":" then filter = w goto nextw end
if t == "[" then
w, v = string.match(w, "([^=]+)=?(%S*)")
end
local match = sets[t][w]
if v and v ~= "" then
v = string.sub(v, 2, #v - 1) -- strip quotes
for node in pairs(match) do
if node.attributes[w] ~= v then
match:remove(node)
end
end
end
local matched = match(t, w)
if filter == "not" then
excludes = excludes + match
excludes = excludes + matched
else
resultset = resultset * match
resultset = resultset * matched
end
filter = nil
::nextw::

View File

@ -9,7 +9,11 @@ local function parse(text)
local node, descend, tpos, opentags = root, true, 1, {}
while true do
local openstart, name
openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
openstart, tpos, name = string.find(root._text,
"<" .. -- an uncaptured starting "<"
"(%w+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- match, but do not capture, everything up to and including the next ">"
tpos)
if not name then break end
local tag = ElementNode:new(name, node, descend, openstart, tpos)
node = tag
@ -17,12 +21,17 @@ local function parse(text)
local tagst, apos = tag:gettext(), 1
while true do
local start, k, eq, quote, v
start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
if not k then break end
start, apos, k, eq, quote = string.find(tagst,
"%s+" .. -- some uncaptured space
"([^%s=]+)" .. -- k = an unspaced string up to an optional "="
"(=?)" .. -- eq = the optional "=", else ""
"(['\"]?)", -- quote = an optional "'" or '"' following the "=", or ""
apos)
if not k or k == "/>" then break end
if eq == "" then
v = ""
else
local pattern = "=([^%s'\">]*)"
local pattern = "=([^%s>]*)"
if quote ~= '' then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end

View File

@ -1,5 +1,5 @@
<!DOCTYPE html>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5>
<html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5 test6=val""6>
<head>
<meta charset="utf-8" />
<link rel="stylesheet" href="test.css" />

View File

@ -10,11 +10,8 @@ local root = htmlparser.parse(text)
local function p(n)
local space = string.rep(" ", n.level)
local s = space .. n.name
for i,v in ipairs(n.nodes) do
s = s .. " nodes[" .. i .. "]=" .. v.name
end
for k,v in pairs(n.attributes) do
s = s .. " " .. k .. "=[" .. v .. "]"
s = s .. " " .. k .. "=[[" .. v .. "]]"
end
print(s)
for i,v in ipairs(n.nodes) do
@ -57,6 +54,11 @@ select("[itemscope]:not([itemprop])")
select("link[rel='alternate']")
select("[test2=\"val='2'\"]")
select("[test5='val5']")
select("[test6='val\"\"6']")
select("[itemscope='']")
select("[itemscope=]")
select("[itemscope]")
print("\nchapters")
local sel, chapters = root("ol.chapters > li"), {}