diff --git a/.gitignore b/.gitignore
index 9bea433..40e2119 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,17 @@
+# LuaRocks #
+######################
+bin/
+lib/
+share/
+*.rock
+# OS generated files #
+######################
.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+Icon?
+ehthumbs.db
+Thumbs.db
\ No newline at end of file
diff --git a/README.md b/README.md
index 2c947c2..10d154d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,92 @@
-lua-htmlparser
-==============
+#LuaRock "htmlparser"
-An HTML parser for lua.
+Parse HTML text into a tree of elements with selectors
+
+[1]: http://wscherphof.github.com/lua-set/
+[2]: http://api.jquery.com/category/selectors/
+
+##License
+MIT; see `./doc/LICENSE`
+
+##Usage
+Start off with
+```lua
+require("luarocks.loader")
+local htmlparser = require("htmlparser")
+```
+Then, parse some html:
+```lua
+local root = htmlparser.parse(htmlstring)
+```
+The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
+Now, find specific contained elements by selecting:
+```lua
+local elements = root:select(selectorstring)
+```
+Or in shorthand:
+```lua
+local elements = root(selectorstring)
+```
+This will return a [Set][1] of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed:
+```lua
+for e in pairs(elements) do
+ print(e.name)
+ local subs = e(subselectorstring)
+ for sub in pairs(subs) do
+ print("", sub.name)
+ end
+end
+```
+The root element is a container for the top-level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
+
+##Selectors
+Supported selectors are a subset of [jQuery's selectors][2]:
+
+- `"*"` all contained elements
+- `"element"` elements with the given tagname
+- `"#id"` elements with the given id attribute value
+- `".class"` elements with the given classname in the class attribute
+- `"[attribute]"` elements with an attribute of the given name
+- `"[attribute='value']"` equals: elements with the given value for the attribute with the given name
+- `"[attribute!='value']"` not equals: elements without an attribute of the given name, or with that attribute, but with a value that is different from the given value
+- `"[attribute|='value']"` prefix: attribute's value is given value, or starts with given value, followed by a hyphen (`-`)
+- `"[attribute*='value']"` contains: attribute's value contains given value
+- `"[attribute~='value']"` word: attribute's value is a space-separated token, where one of the tokens is the given value
+- `"[attribute^='value']"` starts with: attribute's value starts with given value
+- `"[attribute$='value']"` ends with: attribute's value ends with given value
+- `":not(selectorstring)"` elements not selected by given selector string
+- `"ancestor descendant"` elements selected by the `descendant` selector string, that are a descendant of any element selected by the `ancestor` selector string
+- `"parent > child"` elements selected by the `child` selector string, that are a child element of any element selected by the `parent` selector string
+
+Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
+
+###Limitations
+- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
+- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory
+- `line1
line2
You are being redirected to the homepage of the + Htmlparser LuaRock. +
+If you are not redirected after a few seconds, please click on the link above!
+ + \ No newline at end of file diff --git a/test.html b/doc/sample.html similarity index 100% rename from test.html rename to doc/sample.html diff --git a/test.lua b/doc/sample.lua similarity index 93% rename from test.lua rename to doc/sample.lua index 1eff3f6..5c85bdb 100644 --- a/test.lua +++ b/doc/sample.lua @@ -1,7 +1,10 @@ +require("luarocks.loader") +-- Omit next line in actual module clients; it's only to support development of the module itself +package.path = "../src/?.lua;" .. package.path local htmlparser = require("htmlparser") local io = require("io") -local file = io.input("./test.html") +local file = io.input("./sample.html") local text = io.read("*a") file:close() local root = htmlparser.parse(text) @@ -29,6 +32,7 @@ local function select( s ) end print(sel:len()) end + select("*") select("link") select("#/contacts/4711") diff --git a/htmlparser-0.1-1.rockspec b/htmlparser-0.1-1.rockspec new file mode 100644 index 0000000..4238859 --- /dev/null +++ b/htmlparser-0.1-1.rockspec @@ -0,0 +1,26 @@ +package = "htmlparser" +version = "0.1-1" +source = { + url = "git://github.com/wscherphof/lua-htmlparser.git", + branch = "v0.1" +} +description = { + summary = "Parse HTML text into a tree of elements with selectors", + detailed = [[ + Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand. 
+ ]], + homepage = "http://wscherphof.github.com/lua-htmlparser/", + license = "MIT" +} +dependencies = { + "lua >= 5.2", + "set >= 0.1" +} +build = { + type = "builtin", + modules = { + htmlparser = "src/htmlparser.lua", + ["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua", + ["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua" + } +} \ No newline at end of file diff --git a/htmlparser.lua b/src/htmlparser.lua similarity index 94% rename from htmlparser.lua rename to src/htmlparser.lua index 053baad..ba6c4a6 100644 --- a/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,5 +1,5 @@ -local ElementNode = require("ElementNode") -local voidelements = require("voidelements") +local ElementNode = require("htmlparser.ElementNode") +local voidelements = require("htmlparser.voidelements") local HtmlParser = {} diff --git a/ElementNode.lua b/src/htmlparser/ElementNode.lua similarity index 99% rename from ElementNode.lua rename to src/htmlparser/ElementNode.lua index 0acd32b..4d8fc3c 100644 --- a/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -1,4 +1,5 @@ -local Set = require "Set" +require("luarocks.loader") +local Set = require("Set") local ElementNode = {} ElementNode.mt = {__index = ElementNode} diff --git a/voidelements.lua b/src/htmlparser/voidelements.lua similarity index 100% rename from voidelements.lua rename to src/htmlparser/voidelements.lua