First draft for a Rock setup

This commit is contained in:
Wouter Scherphof 2013-03-28 12:24:10 +01:00
parent 3af809df9a
commit 4a9fa0a790
8 changed files with 109 additions and 7 deletions

3
.gitignore vendored
View File

@ -1,8 +1,9 @@
# LuaRocks # # LuaRocks #
###################### ######################
bin/
lib/ lib/
share/ share/
bin/ *.rock
# OS generated files # # OS generated files #
###################### ######################

View File

@ -1,4 +1,75 @@
lua-htmlparser #LuaRock "htmlparser"
==============
An HTML parser for lua. Parse HTML text into a tree of elements with selectors
### License
MIT; see ./doc/LICENSE
### Usage
Start off with
```lua
require("luarocks.loader")
local htmlparser = require("htmlparser")
```
Then, parse some html:
```lua
local root = htmlparser.parse(htmlstring)
```
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
Now, find specific elements by selecting:
```lua
local elements = root:select(selectorstring)
```
Or in shorthand:
```lua
local elements = root(selectorstring)
```
This will return a Set of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed:
```lua
for e in pairs(elements) do
print(e.name)
local subs = e(subselectorstring)
for sub in pairs(subs) do
print("", sub.name)
end
end
```
### Selectors
- "element"
- "#id"
- ".class"
- "[attribute]"
- "[attribute=value]"
- "[attribute!=value]"
- "[attribute|=value]"
- "[attribute*=value]"
- "[attribute~=value]"
- "[attribute^=value]"
- "[attribute$=value]"
- ":not(selector)"
- "ancestor descendant"
- "parent > child"
Selectors can be combined; e.g. ".class:not([attribute]) element.class"
#### Limitations
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between ancestor and descendant, parent and >, or > and child parts of the selector
- Likewise, for the parent > child relation, the spaces before and after the > are mandatory
### Element type
The tree elements provide, apart from :select and (), the following accessors:
- .name = the element's tag name
- .attributes = a table with keys and values for the element's attributes
- .id = the value of the element's id attribute, if present
- .classes = an array with the classes listed in the element's class attribute, if any
- :getcontent() = the raw text between the opening and closing tags of the element
- .nodes = an array with the element's child elements
- .parent = the element that contains this element; root.parent is nil
- :gettext() = the raw text of the complete element, starting with `"<tagname"` and ending with `"/>"`
- .level = how deep the element is in the tree; root level is 0
- .root = the root element of the tree; root.root is root
- .deepernodes = a Set containing all elements in the tree beneath this element, including this element's .nodes
- .deeperelements = a table with a key for each distinct tagname in .deepernodes, containing a Set of all deeper element nodes with that name
- .deeperattributes = as .deeperelements, but keyed on attribute name
- .deeperids = as .deeperelements, but keyed on id value
- .deeperclasses = as .deeperelements, but keyed on class name

View File

@ -1,7 +1,10 @@
require("luarocks.loader")
-- Omit next line in actual module clients; it's only to support development of the module itself
package.path = "../src/?.lua;" .. package.path
local htmlparser = require("htmlparser") local htmlparser = require("htmlparser")
local io = require("io") local io = require("io")
local file = io.input("./test.html") local file = io.input("./sample.html")
local text = io.read("*a") file:close() local text = io.read("*a") file:close()
local root = htmlparser.parse(text) local root = htmlparser.parse(text)
@ -29,6 +32,7 @@ local function select( s )
end end
print(sel:len()) print(sel:len())
end end
select("*") select("*")
select("link") select("link")
select("#/contacts/4711") select("#/contacts/4711")

26
htmlparser-0.1-1.rockspec Normal file
View File

@ -0,0 +1,26 @@
-- LuaRocks rockspec for the "htmlparser" rock, revision 0.1-1.
-- A rockspec is a plain Lua script that LuaRocks reads for its global fields.
package = "htmlparser"
version = "0.1-1"

-- Where LuaRocks fetches the sources from.
source = {
   -- GitHub no longer serves the unauthenticated git:// protocol, so the
   -- repository is fetched over HTTPS instead.
   url = "git+https://github.com/wscherphof/lua-htmlparser.git",
   branch = "v0.1"
}

-- Human-readable metadata shown by `luarocks show` / the rocks index.
-- The long string below is kept flush-left so its content is unchanged.
description = {
   summary = "Parse HTML text into a tree of elements with selectors",
   detailed = [[
Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand.
]],
   homepage = "http://wscherphof.github.com/lua-htmlparser/",
   license = "MIT"
}

-- Rocks (and the Lua version) that must be present before installing.
dependencies = {
   "lua >= 5.2",
   "set >= 0.1"
}

-- "builtin" build type: each key is the module name passed to require(),
-- each value is the source file that provides it.
build = {
   type = "builtin",
   modules = {
      htmlparser = "src/htmlparser.lua",
      ["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua",
      ["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua"
   }
}

View File

@ -1,5 +1,5 @@
local ElementNode = require("ElementNode") local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("voidelements") local voidelements = require("htmlparser.voidelements")
local HtmlParser = {} local HtmlParser = {}