diff --git a/.gitignore b/.gitignore index feff165..40e2119 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ # LuaRocks # ###################### +bin/ lib/ share/ -bin/ +*.rock # OS generated files # ###################### diff --git a/README.md b/README.md index 2c947c2..1125a7e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,75 @@ -lua-htmlparser -============== +#LuaRock "htmlparser" -An HTML parser for lua. +Parse HTML text into a tree of elements with selectors + +###License +MIT; see ./doc/LICENSE + +###Usage +Start off with +```lua +require("luarocks.loader") +local htmlparser = require("htmlparser") +``` +Then, parse some html: +```lua +local root = htmlparser.parse(htmlstring) +``` +The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed. +Now, find specific elements by selecting: +```lua +local elements = root:select(selectorstring) +``` +Or in shorthand: +```lua +local elements = root(selectorstring) +``` +This will return a Set of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed: +```lua +for e in pairs(elements) do + print(e.name) + local subs = e(subselectorstring) + for sub in pairs(subs) do + print("", sub.name) + end +end +``` + +###Selectors +- "element" +- "#id" +- ".class" +- "[attribute]" +- "[attribute=value]" +- "[attribute!=value]" +- "[attribute|=value]" +- "[attribute*=value]" +- "[attribute~=value]" +- "[attribute^=value]" +- "[attribute$=value]" +- ":not(selector)" +- "ancestor descendant" +- "parent > child" +Selectors can be combined; e.g. 
".class:not([attribute]) element.class" + +####Limitations +- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between ancestor and descendant, parent and >, or > and child parts of the selector +- Likewise, for the parent > child relation, the spaces before and after the > are mandatory + +###Element type +The tree elements provide, apart from :select and (), the following accessors: +- .name = the element's tag name +- .attributes = a table with keys and values for the element's attributes +- .id = the value of the element's id attribute, if present +- .classes = an array with the classes listed in the element's class attribute, if any +- :getcontent() = the raw text between the opening and closing tags of the element +- .nodes = an array with the element's child elements +- .parent = the element that contains this element; root.parent is nil +- :gettext() = the raw text of the complete element, starting with `"<tagname" and ending with "/>"` +- .level = how deep the element is in the tree; root level is 0 +- .root = the root element of the tree; root.root is root +- .deepernodes = a Set containing all elements in the tree beneath this element, including this element's .nodes +- .deeperelements = a table with a key for each distinct tagname in .deepernodes, containing a Set of all deeper element nodes with that name +- .deeperattributes = as .deeperelements, but keyed on attribute name +- .deeperids = as .deeperelements, but keyed on id value +- .deeperclasses = as .deeperelements, but keyed on class name diff --git a/test.html b/doc/sample.html similarity index 100% rename from test.html rename to doc/sample.html diff --git a/test.lua b/doc/sample.lua similarity index 93% rename from test.lua rename to doc/sample.lua index 1eff3f6..5c85bdb 100644 --- a/test.lua +++ b/doc/sample.lua @@ -1,7 +1,10 @@ +require("luarocks.loader") +-- Omit next line in actual module clients; it's only to support development of the module itself +package.path = 
"../src/?.lua;" .. package.path local htmlparser = require("htmlparser") local io = require("io") -local file = io.input("./test.html") +local file = io.input("./sample.html") local text = io.read("*a") file:close() local root = htmlparser.parse(text) @@ -29,6 +32,7 @@ local function select( s ) end print(sel:len()) end + select("*") select("link") select("#/contacts/4711") diff --git a/htmlparser-0.1-1.rockspec b/htmlparser-0.1-1.rockspec new file mode 100644 index 0000000..4238859 --- /dev/null +++ b/htmlparser-0.1-1.rockspec @@ -0,0 +1,26 @@ +package = "htmlparser" +version = "0.1-1" +source = { + url = "git://github.com/wscherphof/lua-htmlparser.git", + branch = "v0.1" +} +description = { + summary = "Parse HTML text into a tree of elements with selectors", + detailed = [[ + Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand. + ]], + homepage = "http://wscherphof.github.com/lua-htmlparser/", + license = "MIT" +} +dependencies = { + "lua >= 5.2", + "set >= 0.1" +} +build = { + type = "builtin", + modules = { + htmlparser = "src/htmlparser.lua", + ["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua", + ["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua" + } +} \ No newline at end of file diff --git a/htmlparser.lua b/src/htmlparser.lua similarity index 94% rename from htmlparser.lua rename to src/htmlparser.lua index 053baad..ba6c4a6 100644 --- a/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,5 +1,5 @@ -local ElementNode = require("ElementNode") -local voidelements = require("voidelements") +local ElementNode = require("htmlparser.ElementNode") +local voidelements = require("htmlparser.voidelements") local HtmlParser = {} diff --git a/ElementNode.lua b/src/htmlparser/ElementNode.lua similarity index 100% rename from ElementNode.lua rename to src/htmlparser/ElementNode.lua diff --git 
a/voidelements.lua b/src/htmlparser/voidelements.lua similarity index 100% rename from voidelements.lua rename to src/htmlparser/voidelements.lua