diff --git a/ElementNode.lua b/ElementNode.lua index fdeebc4..4b530ea 100644 --- a/ElementNode.lua +++ b/ElementNode.lua @@ -17,9 +17,7 @@ function ElementNode:new(nameortext, node, descend, openstart, openend) deepernodes = Set:new(), deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} } - if nameortext == "container" then - instance.root = node - elseif not node then + if not node then instance.name = "root" instance.root = instance instance._text = nameortext @@ -90,48 +88,67 @@ function ElementNode:close(closestart, closeend) end local function select(self, s) - if not s or type(s) ~= "string" then return {} end - local subjects = Set:new({self}) - local resultset - local childrenonly + if not s or type(s) ~= "string" or s == "" then return Set:new() end + + local function match(t, w) + local sets = { + [""] = self.deeperelements, + ["["] = self.deeperattributes, + ["#"] = self.deeperids, + ["."] = self.deeperclasses + } + local v + if t == "[" then + w, v = string.match(w, + "([^=]+)" .. -- w = 1 or more characters up to a possible "=" + "=?" .. 
-- an optional uncaptured "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = sets[t][w] + if v and v ~= "" then + v = string.sub(v, 2, #v - 1) -- strip quotes + for node in pairs(matched) do + if node.attributes[w] ~= v then + matched:remove(node) + end + end + end + return matched + end + + local subjects, resultset, childrenonly = Set:new({self}) for part in string.gmatch(s, "%S+") do if part == ">" then childrenonly = true goto nextpart end resultset = Set:new() for subject in pairs(subjects) do - local init = subject.deepernodes - if childrenonly then init = Set:new(subject.nodes) childrenonly = false end - resultset = resultset + init + local star = subject.deepernodes + if childrenonly then star = Set:new(subject.nodes) end + childrenonly = false + resultset = resultset + star end if part == "*" then goto nextpart end - for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do - if t == "" then resultset = resultset * self.deeperelements[w] - elseif t == "[" then resultset = resultset * self.deeperattributes[w] - elseif t == "#" then resultset = resultset * self.deeperids[w] - elseif t == "." then resultset = resultset * self.deeperclasses[w] + local excludes, filter = Set:new() + for t, w in string.gmatch(part, + "([:%[#.]?)" .. -- t = an optional :, [, #, or . + "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or ) + "%]?%)?" 
-- followed by an uncaptured optional ] and/or ) + ) do + if t == ":" then filter = w goto nextw end + local matched = match(t, w) + if filter == "not" then + excludes = excludes + matched + else + resultset = resultset * matched end + filter = nil + ::nextw:: end + resultset = resultset - excludes subjects = Set:new(resultset) ::nextpart:: end - -- construct a container node for the resultset, so that we can :select() on it - local ret = ElementNode:new("container", self) - for node in pairs(resultset) do - table.insert(ret.nodes, node) - ret.deepernodes = ret.deepernodes + node.deepernodes - for listname,list in pairs({ - deeperelements = node.deeperelements, - deeperattributes = node.deeperattributes, - deeperids = node.deeperids, - deeperclasses = node.deeperclasses - }) do - local target = ret[listname] - for k,set in pairs(list) do - -- Set.__add will create an empty Set if not target[k] - target[k] = target[k] + set - end - end - end - return ret + return resultset end function ElementNode:select(s) return select(self, s) end diff --git a/htmlparser.lua b/htmlparser.lua index bcc4ad0..053baad 100644 --- a/htmlparser.lua +++ b/htmlparser.lua @@ -9,7 +9,11 @@ local function parse(text) local node, descend, tpos, opentags = root, true, 1, {} while true do local openstart, name - openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos) + openstart, tpos, name = string.find(root._text, + "<" .. -- an uncaptured starting "<" + "(%w+)" .. 
-- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) if not name then break end local tag = ElementNode:new(name, node, descend, openstart, tpos) node = tag @@ -17,12 +21,17 @@ local function parse(text) local tagst, apos = tag:gettext(), 1 while true do local start, k, eq, quote, v - start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) - if not k then break end + start, apos, k, eq, quote = string.find(tagst, + "%s+" .. -- some uncaptured space + "([^%s=]+)" .. -- k = an unspaced string up to an optional "=" + "(=?)" .. -- eq = the optional "=", else "" + "(['\"]?)", -- quote = an optional "'" or '"' following the "=", or "" + apos) if not k or k == "/>" then break end if eq == "" then v = "" else - local pattern = "=([^%s'\">]*)" + local pattern = "=([^%s>]*)" if quote ~= '' then pattern = quote .. "([^" .. quote .. "]*)" .. quote end diff --git a/test.html b/test.html index 3b5f88b..dbff550 100644 --- a/test.html +++ b/test.html @@ -1,5 +1,5 @@ - + diff --git a/test.lua b/test.lua index 0d42f4a..b139fa5 100644 --- a/test.lua +++ b/test.lua @@ -10,11 +10,8 @@ local root = htmlparser.parse(text) local function p(n) local space = string.rep(" ", n.level) local s = space .. n.name - for i,v in ipairs(n.nodes) do - s = s .. " nodes[" .. i .. "]=" .. v.name - end for k,v in pairs(n.attributes) do - s = s .. " " .. k .. "=[" .. v .. "]" + s = s .. " " .. k .. "=[[" .. v .. 
"]]" end print(s) for i,v in ipairs(n.nodes) do @@ -26,11 +23,11 @@ p(root) local function select( s ) print "" print("->", s) - local tags = root:select(s) - for i,t in ipairs(tags.nodes) do - print(t.name) + local sel = root:select(s) + for element in pairs(sel) do + print(element.name) end - print(# tags.nodes) + print(sel:len()) end select("*") select("link") @@ -51,10 +48,22 @@ select("ul > *") select("body [class]") select("body > [class]") +select(".contacts span:not(.firstname)") +select(":not(a)[href]") +select("[itemscope]:not([itemprop])") + +select("link[rel='alternate']") +select("[test2=\"val='2'\"]") +select("[test5='val5']") +select("[test6='val\"\"6']") +select("[itemscope='']") +select("[itemscope=]") +select("[itemscope]") + print("\nchapters") local sel, chapters = root("ol.chapters > li"), {} -for _,v in ipairs(sel.nodes) do - table.insert(chapters, v:getcontent()) +for e in pairs(sel) do + table.insert(chapters, e:getcontent()) end -- print for i,v in ipairs(chapters) do @@ -62,11 +71,11 @@ for i,v in ipairs(chapters) do end print("\ncontacts") -local sel, contacts = root("ul.contacts > li")("span[class]"), {} -for _,v in ipairs(sel.nodes) do - local id = v.parent.parent.id -- li > a > span +local sel, contacts = root("ul.contacts span[class]"), {} +for e in pairs(sel) do + local id = e.parent.parent.id -- li > a > span contacts[id] = contacts[id] or {} - contacts[id][v.classes[1]] = v:getcontent() + contacts[id][e.classes[1]] = e:getcontent() end -- print for k,v in pairs(contacts) do @@ -78,7 +87,7 @@ end print("\nmicrodata") local sel, scopes = root("[itemprop]"), {} -for _,prop in ipairs(sel.nodes) do +for prop in pairs(sel) do if prop.attributes["itemscope"] then goto nextprop end local descendantscopes, scope = {}, prop while true do @@ -113,4 +122,4 @@ local function printscope(node, table, level) end for node,table in pairs(scopes) do printscope(node, table) -end \ No newline at end of file +end