diff --git a/ElementNode.lua b/ElementNode.lua index ce65dfb..4b530ea 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -89,12 +89,34 @@ end local function select(self, s) if not s or type(s) ~= "string" or s == "" then return Set:new() end - local sets = { - [""] = self.deeperelements, - ["["] = self.deeperattributes, - ["#"] = self.deeperids, - ["."] = self.deeperclasses - } + + local function match(t, w) + local sets = { + [""] = self.deeperelements, + ["["] = self.deeperattributes, + ["#"] = self.deeperids, + ["."] = self.deeperclasses + } + local v + if t == "[" then + w, v = string.match(w, + "([^=]+)" .. -- w = 1 or more characters up to a possible "=" + "=?" .. -- an optional uncaptured "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = sets[t][w] + if v and v ~= "" then + v = string.sub(v, 2, #v - 1) -- strip quotes + for node in pairs(matched) do + if node.attributes[w] ~= v then + matched:remove(node) + end + end + end + return matched + end + local subjects, resultset, childrenonly = Set:new({self}) for part in string.gmatch(s, "%S+") do if part == ">" then childrenonly = true goto nextpart end @@ -107,28 +129,17 @@ local function select(self, s) end if part == "*" then goto nextpart end local excludes, filter = Set:new() - for t, w, v in string.gmatch(part, + for t, w in string.gmatch(part, "([:%[#.]?)" .. -- t = an optional :, [, #, or . "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) "%]?%)?" 
-- followed by an uncaptured optional ] and/or ) ) do if t == ":" then filter = w goto nextw end - if t == "[" then - w, v = string.match(w, "([^=]+)=?(%S*)") - end - local match = sets[t][w] - if v and v ~= "" then - v = string.sub(v, 2, #v - 1) -- strip quotes - for node in pairs(match) do - if node.attributes[w] ~= v then - match:remove(node) - end - end - end + local matched = match(t, w) if filter == "not" then - excludes = excludes + match + excludes = excludes + matched else - resultset = resultset * match + resultset = resultset * matched end filter = nil ::nextw:: diff --git a/htmlparser.lua b/htmlparser.lua index bcc4ad0..053baad 100644 --- a/htmlparser.lua +++ b/htmlparser.lua @@ -9,7 +9,11 @@ local function parse(text) local node, descend, tpos, opentags = root, true, 1, {} while true do local openstart, name - openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos) + openstart, tpos, name = string.find(root._text, + "<" .. -- an uncaptured starting "<" + "(%w+)" .. -- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) if not name then break end local tag = ElementNode:new(name, node, descend, openstart, tpos) node = tag @@ -17,12 +21,17 @@ local function parse(text) local tagst, apos = tag:gettext(), 1 while true do local start, k, eq, quote, v - start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) - if not k then break end + start, apos, k, eq, quote = string.find(tagst, + "%s+" .. -- some uncaptured space + "([^%s=]+)" .. -- k = an unspaced string up to an optional "=" + "(=?)" .. -- eq = the optional "=", else "" + "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" + apos) + if not k or k == "/>" then break end if eq == "" then v = "" else - local pattern = "=([^%s'\">]*)" + local pattern = "=([^%s>]*)" if quote ~= '' then pattern = quote .. "([^" .. quote .. "]*)" .. 
quote end diff --git a/test.html b/test.html index 3b5f88b..dbff550 100644 --- a/test.html +++ b/test.html @@ -1,5 +1,5 @@ - +
diff --git a/test.lua b/test.lua index d4be9b2..b139fa5 100644 --- a/test.lua +++ b/test.lua @@ -10,11 +10,8 @@ local root = htmlparser.parse(text) local function p(n) local space = string.rep(" ", n.level) local s = space .. n.name - for i,v in ipairs(n.nodes) do - s = s .. " nodes[" .. i .. "]=" .. v.name - end for k,v in pairs(n.attributes) do - s = s .. " " .. k .. "=[" .. v .. "]" + s = s .. " " .. k .. "=[[" .. v .. "]]" end print(s) for i,v in ipairs(n.nodes) do @@ -57,6 +54,11 @@ select("[itemscope]:not([itemprop])") select("link[rel='alternate']") select("[test2=\"val='2'\"]") +select("[test5='val5']") +select("[test6='val\"\"6']") +select("[itemscope='']") +select("[itemscope=]") +select("[itemscope]") print("\nchapters") local sel, chapters = root("ol.chapters > li"), {}