mirror of
				https://github.com/TangentFoxy/lua-htmlparser.git
				synced 2025-10-24 20:35:01 +00:00 
			
		
		
		
	first draft
Initial working version in version control
This commit is contained in:
		
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
|  | ||||
| .DS_Store | ||||
							
								
								
									
										119
									
								
								ElementNode.lua
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								ElementNode.lua
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,119 @@ | ||||
| local Set = require "Set" | ||||
|  | ||||
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}

--- Create a node and link it into the tree.
-- Called with only `nameortext`, this builds the root node: its name is
-- "root", the argument is kept as the document text in `_text`, and both
-- the open and close ranges span the whole text.
-- Called with a `node`, the new element is attached either below `node`
-- (`descend` truthy) or next to it, under `node.parent` (`descend` falsy).
-- @param nameortext element name, or the full document text for the root
-- @param node       reference node used to position the new element
-- @param descend    truthy: child of `node`; falsy: sibling of `node`
-- @param openstart  index in the text where the opening tag starts
-- @param openend    index in the text where the opening tag ends
-- @return the new ElementNode instance
function ElementNode:new(nameortext, node, descend, openstart, openend)
  local instance = {
    name = nameortext,
    level = 0,
    nodes = {},
    attributes = {},
    classes = {},
    -- until close() is told otherwise, the close range equals the open range
    _openstart = openstart, _openend = openend,
    _closestart = openstart, _closeend = openend,
    -- indexes of everything found below this node, filled in by close()
    deepernodes = Set:new(),
    deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
  }
  if node then
    local parent, level
    if descend then
      parent, level = node, node.level + 1
    else
      parent, level = node.parent, node.level
    end
    instance.root = node.root
    instance.parent = parent
    instance.level = level
    parent.nodes[#parent.nodes + 1] = instance
  else
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._openend = 1, length
    instance._closestart, instance._closeend = 1, length
  end
  return setmetatable(instance, ElementNode.mt)
end
|  | ||||
--- Return the slice of the root's text from the start of this element's
-- opening range to the end of its closing range.
-- @return string
function ElementNode:gettext()
  local text = self.root._text
  local first, last = self._openstart, self._closeend
  return string.sub(text, first, last)
end
|  | ||||
--- Return the slice of the root's text that lies strictly between the end of
-- this element's opening range and the start of its closing range.
-- @return string
function ElementNode:getcontent()
  local text = self.root._text
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(text, first, last)
end
|  | ||||
--- Record an attribute on this element.
-- The value is stored under the key exactly as given. In addition, an
-- attribute named "id" (any letter case) sets `self.id`, and one named
-- "class" (any letter case) has each whitespace-separated token appended to
-- `self.classes` for quick access.
-- @param k attribute name
-- @param v attribute value
function ElementNode:addattribute(k, v)
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
  elseif lowered == "class" then
    local classes = self.classes
    for token in string.gmatch(v, "%S+") do
      classes[#classes + 1] = token
    end
  end
end
|  | ||||
-- Add `node` to the Set stored at `list[name]`, creating that Set on first use.
local function insert(list, name, node)
  local bucket = list[name]
  if not bucket then
    bucket = Set:new()
    list[name] = bucket
  end
  bucket:add(node)
end
|  | ||||
--- Mark this element as closed and register it with all of its ancestors.
-- When both positions are given they replace the element's closing range;
-- otherwise the range set by the constructor is kept.
-- @param closestart (optional) index where the closing tag starts
-- @param closeend   (optional) index where the closing tag ends
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart, self._closeend = closestart, closeend
  end
  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attribute in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attribute, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for index = 1, #self.classes do
      insert(ancestor.deeperclasses, self.classes[index], self)
    end
    ancestor = ancestor.parent
  end
end
|  | ||||
--- Select elements below this node with a simplified CSS-like selector.
-- The selector is split on whitespace. Each part narrows the current set of
-- subjects (initially just this node) to matching descendants:
--   *        every candidate
--   name     elements with that name
--   #id      elements with that id
--   .class   elements carrying that class
--   [attr]   elements having that attribute
-- Filters may be combined in one part (e.g. "span.firstname", "li[id]").
-- A standalone ">" part restricts the following part to direct children of
-- the current subjects instead of all of their descendants.
-- @param s selector string
-- @return list (array) of matching nodes in unspecified order; an empty list
--         when `s` is not a string or contains no selecting part
function ElementNode:select(s)
  if type(s) ~= "string" then return {} end
  local subjects = Set:new({self})
  local resultset
  local childrenonly = false
  for part in string.gmatch(s, "%S+") do
    if part == ">" then
      childrenonly = true
    else
      -- gather the candidates of every current subject
      resultset = Set:new()
      for subject in pairs(subjects) do
        local candidates = subject.deepernodes
        if childrenonly then candidates = Set:new(subject.nodes) end
        resultset = resultset + candidates
      end
      -- the combinator applies to all subjects of this part, so it is only
      -- reset once every subject has been processed
      childrenonly = false
      if part ~= "*" then
        -- every node below a subject is also below self, so self's indexes
        -- are sufficient to filter the candidates
        for t, w in string.gmatch(part, "([%[#%.]?)([^%[%]#%.]+)") do
          if t == "" then resultset = resultset * self.deeperelements[w]
          elseif t == "[" then resultset = resultset * self.deeperattributes[w]
          elseif t == "#" then resultset = resultset * self.deeperids[w]
          elseif t == "." then resultset = resultset * self.deeperclasses[w]
          end
        end
      end
      -- the matches (including those of "*") are the subjects of the next part
      subjects = resultset
    end
  end
  -- no selecting part at all (e.g. "" or ">"): nothing was matched
  if not resultset then return {} end
  return resultset:tolist()
end
|  | ||||
| return ElementNode | ||||
							
								
								
									
										57
									
								
								HtmlParser.lua
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								HtmlParser.lua
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,57 @@ | ||||
| local ElementNode = require("ElementNode") | ||||
| local voidelements = require("voidelements") | ||||
|  | ||||
local HtmlParser = {}

--- Parse HTML text into a tree of ElementNode objects.
-- Opening tags are located with a Lua pattern, their `name=value` attributes
-- are extracted, and the closing tags that follow each opening tag are
-- matched against the most recently opened element of the same name.
-- Elements listed in `voidelements` are closed immediately and never get
-- children. Closing tags without a matching open element are ignored.
-- @param text the HTML source
-- @return the root ElementNode
local function parse(text)
  local root = ElementNode:new(text)

  -- node:     the element the next opening tag is positioned against
  -- descend:  true -> next element is a child of node, false -> a sibling
  -- opentags: lowercase tag name -> stack of still-open elements
  local node, descend, tpos, opentags = root, true, 1, {}
  while true do
    local openstart, name
    openstart, tpos, name = string.find(root._text, "<(%w+)[^>]*>", tpos)
    if not name then break end
    local tag = ElementNode:new(name, node, descend, openstart, tpos)
    node = tag

    -- the element is not closed yet, so gettext() is just the opening tag
    local tagst, apos = tag:gettext(), 1
    while true do
      local _, keyend, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
      if not k then break end
      local pattern = "=([^%s'\">]*)"
      if quote ~= '' then
        pattern = quote .. "([^" .. quote .. "]*)" .. quote
      end
      local _, valueend, v = string.find(tagst, pattern, keyend)
      -- An unterminated quoted value (e.g. the tag text was cut at a ">"
      -- inside the quotes) cannot be matched. Stop here: continuing with a
      -- nil position would restart the scan at 1 and never terminate.
      if not valueend then break end
      apos = valueend
      tag:addattribute(k, v)
    end

    local tagkey = string.lower(tag.name)
    if voidelements[tagkey] then
      descend = false
      tag:close()
    else
      -- a non-void element may have children, also when it follows a void one
      descend = true
      local stack = opentags[tagkey]
      if not stack then
        stack = {}
        opentags[tagkey] = stack
      end
      stack[#stack + 1] = tag
    end

    -- consume the closing tags that follow, up to the next opening tag
    local closeend = tpos
    while true do
      local closestart, closing, closename
      closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
      -- no further tag, or the next tag is an opening tag
      if closing ~= "/" then break end
      local stack = opentags[string.lower(closename)]
      local closed = stack and table.remove(stack)
      if closed then
        closestart = string.find(root._text, "<", closestart)
        closed:close(closestart, closeend + 1)
        node = closed.parent
        descend = true
      end
      -- otherwise: stray closing tag, skip it and keep scanning
    end
  end

  return root
end
HtmlParser.parse = parse
|  | ||||
| return HtmlParser | ||||
|  | ||||
							
								
								
									
										87
									
								
								Set.lua
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								Set.lua
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,87 @@ | ||||
local Set = {}
Set.mt = {__index = Set}

--- Create a new set. Members are stored as keys with the value `true`.
-- @param t (optional) initial content:
--   * nil             -> empty set
--   * another Set     -> copy of its members
--   * non-empty list  -> the list's values become members
--   * any other table -> the table's keys become members
--   * any other value -> a set containing just that value
-- @return the new Set
function Set:new(t)
  local instance = {}
  if type(t) == "table" then
    -- A Set is always copied by key, even when its members happen to be
    -- 1..n and therefore make `#t` non-zero.
    if getmetatable(t) ~= Set.mt and #t > 0 then
      for _,v in ipairs(t) do
        instance[v] = true
      end
    else
      for k in pairs(t) do
        instance[k] = true
      end
    end
  elseif t ~= nil then
    instance[t] = true
  end
  return setmetatable(instance, Set.mt)
end
|  | ||||
--- Make `e` a member of the set.
function Set:add(e)
  rawset(self, e, true)
end

--- Remove `e` from the set; removing a non-member has no effect.
function Set:remove(e)
  rawset(self, e, nil)
end
|  | ||||
-- Union: `a + b` yields a new Set holding the members of both operands.
-- An operand that is not a Set is first converted with Set:new.
Set.mt.__add = function (a, b)
  local lhs = (getmetatable(a) == Set.mt) and a or Set:new(a)
  local rhs = (getmetatable(b) == Set.mt) and b or Set:new(b)
  local union = Set:new()
  for member in pairs(lhs) do union[member] = true end
  for member in pairs(rhs) do union[member] = true end
  return union
end
|  | ||||
-- Subtraction: `a - b` yields a new Set holding the members of `a` that are
-- not members of `b`. An operand that is not a Set is first converted with
-- Set:new.
Set.mt.__sub = function (a, b)
  local lhs = (getmetatable(a) == Set.mt) and a or Set:new(a)
  local rhs = (getmetatable(b) == Set.mt) and b or Set:new(b)
  local difference = Set:new()
  for member in pairs(lhs) do
    -- raw lookup: only real members of rhs exclude, not inherited methods
    if rawget(rhs, member) == nil then
      difference[member] = true
    end
  end
  return difference
end
|  | ||||
-- Intersection: `a * b` yields a new Set holding the members common to both
-- operands. An operand that is not a Set is first converted with Set:new.
Set.mt.__mul = function (a, b)
  local res = Set:new()
  if getmetatable(a) ~= Set.mt then a = Set:new(a) end
  if getmetatable(b) ~= Set.mt then b = Set:new(b) end
  for k in pairs(a) do
    -- Membership must be tested with a raw lookup: `b[k]` would fall back to
    -- the Set methods through __index, so a member such as "add" or "len"
    -- would always appear to be present (and a function would be stored).
    if rawget(b, k) ~= nil then
      res[k] = true
    end
  end
  return res
end
|  | ||||
-- String representation: "{m1, m2, ...}" with the members in unspecified
-- order. Every member is converted with tostring() so that members which
-- cannot be concatenated directly (tables, booleans, functions) are
-- supported as well.
Set.mt.__tostring = function (set)
  local parts = {}
  for k in pairs(set) do
    parts[#parts + 1] = tostring(k)
  end
  return "{" .. table.concat(parts, ", ") .. "}"
end
|  | ||||
--- Count the members of the set.
-- @return number of members
function Set:len()
  local count = 0
  local member = next(self)
  while member ~= nil do
    count = count + 1
    member = next(self, member)
  end
  return count
end
|  | ||||
--- Return the members of the set as an array, in unspecified order.
-- @return list of members
function Set:tolist()
  local list = {}
  for member in pairs(self) do
    list[#list + 1] = member
  end
  return list
end
|  | ||||
| return Set | ||||
							
								
								
									
										43
									
								
								test.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								test.html
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| <!DOCTYPE html> | ||||
| <html lang="en" test1='val1' test2="val='2'" test3='val="3"' test4="val = 4" test5=val5> | ||||
| <head> | ||||
|   <meta charset="utf-8" /> | ||||
|   <link rel="stylesheet" href="test.css" /> | ||||
|   <link rel="alternate" title="Feed" type="application/atom+xml" href="#" /> | ||||
| </head> | ||||
| <body> | ||||
|   <h1>Contents</h1> | ||||
|   <ol class="chapters"> | ||||
|     <li>Preface</li> | ||||
|     <li>Introduction</li> | ||||
|     <li>Concepts</li> | ||||
|     <li>Theory</li> | ||||
|     <li>Hypotheses</li> | ||||
|     <li>Experiments</li> | ||||
|     <li>Conclusions</li> | ||||
|     <li>References</li> | ||||
|   </ol> | ||||
|   <h1>Acknowledgements</h1> | ||||
|   <p> | ||||
|     Surely, we could not have done this huge amount of work all by ourselves.<br /> | ||||
|     Therefore, we cannot thank enough the following persons for their kind contributions: | ||||
|     <!-- | ||||
|       Of course, the text in this paragraph only serve presentation purposes, i.e. it's not actually part of the machine-consumable structured data that this API is serving. | ||||
|     --> | ||||
|   </p> | ||||
|   <ul class="contacts"> | ||||
|     <li id="/contacts/4711"> | ||||
|       <a href="/contacts/4711"> | ||||
|         <span class="firstname">Jon</span> | ||||
|         <span class="lastname">Moore</span> | ||||
|       </a> | ||||
|     </li> | ||||
|     <li id="/contacts/4712"> | ||||
|       <a href="/contacts/4712"> | ||||
|         <span class="firstname">Homer</span> | ||||
|         <span class="lastname">Simpson</span> | ||||
|       </a> | ||||
|     </li> | ||||
|   </ul> | ||||
| </body> | ||||
| </html> | ||||
							
								
								
									
										52
									
								
								test.lua
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								test.lua
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
| local HtmlParser = require("HtmlParser") | ||||
|  | ||||
local io = require("io")

-- Read the fixture through its own handle. Going through io.input() would
-- replace the process-wide default input stream and, after the close below,
-- leave it pointing at a closed file.
local file = assert(io.open("./test.html", "r"))
local text = file:read("*a")
file:close()

local root = HtmlParser.parse(text)
|  | ||||
-- Print the tree: one line per node, indented by two spaces per level and
-- listing the node's name, the names of its direct children and its
-- attributes; then the same for every child, depth first.
local function p(n)
  local parts = { string.rep("  ", n.level) .. n.name }
  for index, child in ipairs(n.nodes) do
    parts[#parts + 1] = " nodes[" .. index .. "]=" .. child.name
  end
  for key, value in pairs(n.attributes) do
    parts[#parts + 1] = " " .. key .. "=[" .. value .. "]"
  end
  print(table.concat(parts))
  for _, child in ipairs(n.nodes) do
    p(child)
  end
end
| p(root) | ||||
|  | ||||
-- Run selector `s` against the parsed tree and print a blank line, the
-- selector, the name of every match and finally the number of matches.
local function select(s)
  print("")
  print("->", s)
  local matches = root:select(s)
  local count = #matches
  for index = 1, count do
    print(matches[index].name)
  end
  print(count)
end
-- Selectors exercised by this script, run in the listed order.
local selectors = {
  -- single-part selectors
  "*",
  "link",
  "#/contacts/4711",
  ".chapters",
  "[href]",
  "span.firstname",
  "ul[id]",
  -- descendant selectors
  "#/contacts/4711",
  "#/contacts/4711 *",
  "#/contacts/4711 .lastname",
  "body li[id]",
  -- descendants versus direct children
  "ul",
  "ul *",
  "ul > *",
  "body [class]",
  "body > [class]",
}
for _, selector in ipairs(selectors) do
  select(selector)
end
							
								
								
									
										18
									
								
								voidelements.lua
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								voidelements.lua
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | ||||
-- Lookup table of void element names: each listed name maps to `true`.
local names = {
  "area", "base", "br", "col",
  "command", "embed", "hr", "img",
  "input", "keygen", "link", "meta",
  "param", "source", "track", "wbr",
}

local voidelements = {}
for _, name in ipairs(names) do
  voidelements[name] = true
end

return voidelements
		Reference in New Issue
	
	Block a user