diff --git a/.gitignore b/.gitignore index 9bea433..40e2119 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,17 @@ +# LuaRocks # +###################### +bin/ +lib/ +share/ +*.rock +# OS generated files # +###################### .DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +Icon? +ehthumbs.db +Thumbs.db \ No newline at end of file diff --git a/README.md b/README.md index 2c947c2..10d154d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,92 @@ -lua-htmlparser -============== +#LuaRock "htmlparser" -An HTML parser for lua. +Parse HTML text into a tree of elements with selectors + +[1]: http://wscherphof.github.com/lua-set/ +[2]: http://api.jquery.com/category/selectors/ + +##License +MIT; see `./doc/LICENSE` + +##Usage +Start off with +```lua +require("luarocks.loader") +local htmlparser = require("htmlparser") +``` +Then, parse some html: +```lua +local root = htmlparser.parse(htmlstring) +``` +The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed. +Now, find specific contained elements by selecting: +```lua +local elements = root:select(selectorstring) +``` +Or in shorthand: +```lua +local elements = root(selectorstring) +``` +This will return a [Set][1] of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed: +```lua +for e in pairs(elements) do + print(e.name) + local subs = e(subselectorstring) + for sub in pairs(subs) do + print("", sub.name) + end +end +``` +The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element. 
+ +##Selectors +Supported selectors are a subset of [jQuery's selectors][2]: + +- `"*"` all contained elements +- `"element"` elements with the given tagname +- `"#id"` elements with the given id attribute value +- `".class"` elements with the given classname in the class attribute +- `"[attribute]"` elements with an attribute of the given name +- `"[attribute='value']"` equals: elements with the given value for the attribute with the given name +- `"[attribute!='value']"` not equals: elements without an attribute of the given name, or with that attribute, but with a value that is different from the given value +- `"[attribute|='value']"` prefix: attribute's value is given value, or starts with given value, followed by a hyphen (`-`) +- `"[attribute*='value']"` contains: attribute's value contains given value +- `"[attribute~='value']"` word: attribute's value is a space-separated list of tokens, one of which is the given value +- `"[attribute^='value']"` starts with: attribute's value starts with given value +- `"[attribute$='value']"` ends with: attribute's value ends with given value +- `":not(selectorstring)"` elements not selected by given selector string +- `"ancestor descendant"` elements selected by the `descendant` selector string, that are a descendant of any element selected by the `ancestor` selector string +- `"parent > child"` elements selected by the `child` selector string, that are a child element of any element selected by the `parent` selector string + +Selectors can be combined; e.g. `".class:not([attribute]) element.class"` + +###Limitations +- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector +- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory +- Text nodes are not separate entries in the tree, so the content of `<p>line1<br />line2</p>` is plainly `"line1<br />line2"` + +##Examples +See `./doc/sample.lua` + +##Element type +All tree elements provide, apart from `:select` and `()`, the following accessors: + +###Basic +- `.name` the element's tagname +- `.attributes` a table with keys and values for the element's attributes; `{}` if none +- `.id` the value of the element's id attribute; `nil` if not present +- `.classes` an array with the classes listed in element's class attribute; `{}` if none +- `:getcontent()` the raw text between the opening and closing tags of the element; `""` if none +- `.nodes` an array with the element's child elements, `{}` if none +- `.parent` the element that contains this element; `root.parent` is `nil` + +###Other +- `:gettext()` the raw text of the complete element, starting with `"<tagname"` and ending with `"/>"` +- `.level` how deep the element is in the tree; root level is `0` +- `.root` the root element of the tree; `root.root` is `root` +- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none +- `.deeperelements` a table with a key for each distinct tagname in `.deepernodes`, containing a [Set][1] of all deeper element nodes with that name; `{}` if none +- `.deeperattributes` as `.deeperelements`, but keyed on attribute name +- `.deeperids` as `.deeperelements`, but keyed on id value +- `.deeperclasses` as `.deeperelements`, but keyed on class name diff --git a/Set.lua b/Set.lua deleted file mode 100644 index 104e31f..0000000 --- a/Set.lua +++ /dev/null @@ -1,87 +0,0 @@ -local Set = {} -Set.mt = {__index = Set} -function Set:new(t) - local instance = {} - if type(t) == "table" then - if #t > 0 then - for _,v in ipairs(t) do - instance[v] = true - end - else - for k in pairs(t) do - instance[k] = true - end - end - else - instance = {t} - end - return setmetatable(instance, Set.mt) -end - -function Set:add(e) - self[e] = true -end - -function Set:remove(e) - self[e] = nil -end - --- Union -Set.mt.__add = function (a, b) - local res = 
Set:new() - if getmetatable(a) ~= Set.mt then a = Set:new(a) end - if getmetatable(b) ~= Set.mt then b = Set:new(b) end - for k in pairs(a) do res[k] = true end - for k in pairs(b) do res[k] = true end - return res -end - --- Subtraction -Set.mt.__sub = function (a, b) - local res = Set:new() - if getmetatable(a) ~= Set.mt then a = Set:new(a) end - if getmetatable(b) ~= Set.mt then b = Set:new(b) end - for k in pairs(a) do res[k] = true end - for k in pairs(b) do res[k] = nil end - return res -end - --- Intersection -Set.mt.__mul = function (a, b) - local res = Set:new() - if getmetatable(a) ~= Set.mt then a = Set:new(a) end - if getmetatable(b) ~= Set.mt then b = Set:new(b) end - for k in pairs(a) do - res[k] = b[k] - end - return res -end - --- String representation -Set.mt.__tostring = function (set) - local s = "{" - local sep = "" - for k in pairs(set) do - s = s .. sep .. k - sep = ", " - end - return s .. "}" -end - -function Set:len() - local num = 0 - for _ in pairs(self) do - num = num + 1 - end - return num -end - -function Set:tolist() - local res = {} - for k in pairs(self) do - table.insert(res, k) - end - return res -end - -return Set \ No newline at end of file diff --git a/LICENSE b/doc/LICENSE similarity index 100% rename from LICENSE rename to doc/LICENSE diff --git a/doc/README.html b/doc/README.html new file mode 100644 index 0000000..59f7df1 --- /dev/null +++ b/doc/README.html @@ -0,0 +1,14 @@ + + + + Htmlparser LuaRock Readme + + + + +

You are being redirected to the homepage of the + Htmlparser LuaRock. +

+

If you are not redirected after a few seconds, please click on the link above!

+ + \ No newline at end of file diff --git a/test.html b/doc/sample.html similarity index 100% rename from test.html rename to doc/sample.html diff --git a/test.lua b/doc/sample.lua similarity index 93% rename from test.lua rename to doc/sample.lua index 1eff3f6..5c85bdb 100644 --- a/test.lua +++ b/doc/sample.lua @@ -1,7 +1,10 @@ +require("luarocks.loader") +-- Omit next line in actual module clients; it's only to support development of the module itself +package.path = "../src/?.lua;" .. package.path local htmlparser = require("htmlparser") local io = require("io") -local file = io.input("./test.html") +local file = io.input("./sample.html") local text = io.read("*a") file:close() local root = htmlparser.parse(text) @@ -29,6 +32,7 @@ local function select( s ) end print(sel:len()) end + select("*") select("link") select("#/contacts/4711") diff --git a/htmlparser-0.1-1.rockspec b/htmlparser-0.1-1.rockspec new file mode 100644 index 0000000..4238859 --- /dev/null +++ b/htmlparser-0.1-1.rockspec @@ -0,0 +1,26 @@ +package = "htmlparser" +version = "0.1-1" +source = { + url = "git://github.com/wscherphof/lua-htmlparser.git", + branch = "v0.1" +} +description = { + summary = "Parse HTML text into a tree of elements with selectors", + detailed = [[ + Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand. 
+ ]], + homepage = "http://wscherphof.github.com/lua-htmlparser/", + license = "MIT" +} +dependencies = { + "lua >= 5.2", + "set >= 0.1" +} +build = { + type = "builtin", + modules = { + htmlparser = "src/htmlparser.lua", + ["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua", + ["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua" + } +} \ No newline at end of file diff --git a/htmlparser.lua b/src/htmlparser.lua similarity index 94% rename from htmlparser.lua rename to src/htmlparser.lua index 053baad..ba6c4a6 100644 --- a/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,5 +1,5 @@ -local ElementNode = require("ElementNode") -local voidelements = require("voidelements") +local ElementNode = require("htmlparser.ElementNode") +local voidelements = require("htmlparser.voidelements") local HtmlParser = {} diff --git a/ElementNode.lua b/src/htmlparser/ElementNode.lua similarity index 99% rename from ElementNode.lua rename to src/htmlparser/ElementNode.lua index 0acd32b..4d8fc3c 100644 --- a/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -1,4 +1,5 @@ -local Set = require "Set" +require("luarocks.loader") +local Set = require("Set") local ElementNode = {} ElementNode.mt = {__index = ElementNode} diff --git a/voidelements.lua b/src/htmlparser/voidelements.lua similarity index 100% rename from voidelements.lua rename to src/htmlparser/voidelements.lua