Merge pull request #15 from wscherphof/#2-#3-#4-LuaRocks

closes #1 #2 #3 #5
This commit is contained in:
Wouter Scherphof 2013-03-28 05:36:58 -07:00
commit f20d84c626
11 changed files with 155 additions and 94 deletions

15
.gitignore vendored
View File

@ -1,2 +1,17 @@
# LuaRocks #
######################
bin/
lib/
share/
*.rock
# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
Icon?
ehthumbs.db
Thumbs.db

View File

@ -1,4 +1,92 @@
lua-htmlparser
==============
#LuaRock "htmlparser"
An HTML parser for lua.
Parse HTML text into a tree of elements with selectors
[1]: http://wscherphof.github.com/lua-set/
[2]: http://api.jquery.com/category/selectors/
##License
MIT; see `./doc/LICENSE`
##Usage
Start off with
```lua
require("luarocks.loader")
local htmlparser = require("htmlparser")
```
Then, parse some html:
```lua
local root = htmlparser.parse(htmlstring)
```
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
Now, find specific contained elements by selecting:
```lua
local elements = root:select(selectorstring)
```
Or in shorthand:
```lua
local elements = root(selectorstring)
```
This will return a [Set][1] of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed:
```lua
for e in pairs(elements) do
print(e.name)
local subs = e(subselectorstring)
for sub in pairs(subs) do
print("", sub.name)
end
end
```
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
##Selectors
Supported selectors are a subset of [jQuery's selectors][2]:
- `"*"` all contained elements
- `"element"` elements with the given tagname
- `"#id"` elements with the given id attribute value
- `".class"` elements with the given classname in the class attribute
- `"[attribute]"` elements with an attribute of the given name
- `"[attribute='value']"` equals: elements with the given value for the attribute with the given name
- `"[attribute!='value']"` not equals: elements without an attribute of the given name, or with that attribute, but with a value that is different from the given value
- `"[attribute|='value']"` prefix: attribute's value is given value, or starts with given value, followed by a hyphen (`-`)
- `"[attribute*='value']"` contains: attribute's value contains given value
- `"[attribute~='value']"` word: attribute's value is a space-separated list of tokens, one of which is the given value
- `"[attribute^='value']"` starts with: attribute's value starts with given value
- `"[attribute$='value']"` ends with: attribute's value ends with given value
- `":not(selectorstring)"` elements not selected by given selector string
- `"ancestor descendant"` elements selected by the `descendant` selector string, that are a descendant of any element selected by the `ancestor` selector string
- `"parent > child"` elements selected by the `child` selector string, that are a child element of any element selected by the `parent` selector string
Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
###Limitations
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory
- `<!` elements are not parsed, including doctype and comments
- Textnodes are not separate entries in the tree, so the content of `<p>line1<br />line2</p>` is plainly `"line1<br />line2"`
##Examples
See `./doc/samples.lua`
##Element type
All tree elements provide, apart from `:select` and `()`, the following accessors:
###Basic
- `.name` the element's tagname
- `.attributes` a table with keys and values for the element's attributes; `{}` if none
- `.id` the value of the element's id attribute; `nil` if not present
- `.classes` an array with the classes listed in element's class attribute; `{}` if none
- `:getcontent()` the raw text between the opening and closing tags of the element; `""` if none
- `.nodes` an array with the element's child elements, `{}` if none
- `.parent` the element that contains this element; `root.parent` is `nil`
###Other
- `:gettext()` the raw text of the complete element, starting with `"<tagname"` and ending with `"/>"`
- `.level` how deep the element is in the tree; root level is `0`
- `.root` the root element of the tree; `root.root` is `root`
- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none
- `.deeperelements` a table with a key for each distinct tagname in `.deepernodes`, containing a [Set][1] of all deeper element nodes with that name; `{}` if none
- `.deeperattributes` as `.deeperelements`, but keyed on attribute name
- `.deeperids` as `.deeperelements`, but keyed on id value
- `.deeperclasses` as `.deeperelements`, but keyed on class name

87
Set.lua
View File

@ -1,87 +0,0 @@
--- A minimal set type built on a plain table.
-- Every member is stored as a key whose value is `true`; the methods and the
-- operator metamethods live on `Set` / `Set.mt` and are reached via `__index`.
-- Supported operators: `a + b` (union), `a - b` (subtraction),
-- `a * b` (intersection) and `tostring(set)`.
local Set = {}
Set.mt = {__index = Set}

--- Create a new set.
-- @param t one of:
--   * a table with a non-empty array part: its array values become members;
--   * any other table: its keys become members;
--   * `nil`: the set is empty;
--   * any other value: that value becomes the single member.
-- @return a new Set instance
function Set:new(t)
  local instance = {}
  if type(t) == "table" then
    if #t > 0 then
      for _, v in ipairs(t) do
        instance[v] = true
      end
    else
      for k in pairs(t) do
        instance[k] = true
      end
    end
  elseif t ~= nil then
    -- Store the scalar as a member (key), consistent with the table branches;
    -- `{t}` would have stored the key 1 instead of the value itself.
    instance[t] = true
  end
  return setmetatable(instance, Set.mt)
end

--- Add element `e` to the set.
function Set:add(e)
  self[e] = true
end

--- Remove element `e` from the set (no-op if it is not a member).
function Set:remove(e)
  self[e] = nil
end

-- Returns `v` unchanged when it already is a Set, otherwise wraps it in one,
-- so the operators accept plain tables and scalars as operands.
local function toset(v)
  if getmetatable(v) == Set.mt then
    return v
  end
  return Set:new(v)
end

-- Union
Set.mt.__add = function (a, b)
  local res = Set:new()
  for k in pairs(toset(a)) do res[k] = true end
  for k in pairs(toset(b)) do res[k] = true end
  return res
end

-- Subtraction
Set.mt.__sub = function (a, b)
  local res = Set:new()
  for k in pairs(toset(a)) do res[k] = true end
  for k in pairs(toset(b)) do res[k] = nil end
  return res
end

-- Intersection
Set.mt.__mul = function (a, b)
  local res = Set:new()
  a, b = toset(a), toset(b)
  for k in pairs(a) do
    -- rawget bypasses __index: a member named like a method ("add", "len",
    -- ...) must not be resolved to the method and counted as present in b.
    if rawget(b, k) ~= nil then
      res[k] = true
    end
  end
  return res
end

-- String representation: "{e1, e2, ...}"; element order is unspecified.
Set.mt.__tostring = function (set)
  local parts = {}
  for k in pairs(set) do
    -- tostring() so that table, boolean and other non-string members do not
    -- raise a concatenation error.
    parts[#parts + 1] = tostring(k)
  end
  return "{" .. table.concat(parts, ", ") .. "}"
end

--- Count the members of the set.
-- @return the number of members
function Set:len()
  local num = 0
  for _ in pairs(self) do
    num = num + 1
  end
  return num
end

--- Copy the members into an array.
-- @return a new array table holding every member, in unspecified order
function Set:tolist()
  local res = {}
  for k in pairs(self) do
    res[#res + 1] = k
  end
  return res
end

return Set

14
doc/README.html Normal file
View File

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<!-- Placeholder readme: the meta refresh below forwards the browser to the
     project homepage after 3 seconds; the body offers the same URL as a
     manual link in case the automatic redirect does not happen. -->
<head>
<title>Htmlparser LuaRock Readme</title>
<meta charset="UTF-8">
<meta http-equiv="Refresh" content="3; url=http://wscherphof.github.com/lua-htmlparser/" />
</head>
<body>
<p>You are being redirected to the homepage of the
<a href="http://wscherphof.github.com/lua-htmlparser/">Htmlparser LuaRock</a>.
</p>
<p>If you are not redirected after a few seconds, please click on the link above!</p>
</body>
</html>

View File

@ -1,7 +1,10 @@
require("luarocks.loader")
-- Omit next line in actual module clients; it's only to support development of the module itself
package.path = "../src/?.lua;" .. package.path
local htmlparser = require("htmlparser")
local io = require("io")
local file = io.input("./test.html")
local file = io.input("./sample.html")
local text = io.read("*a") file:close()
local root = htmlparser.parse(text)
@ -29,6 +32,7 @@ local function select( s )
end
print(sel:len())
end
select("*")
select("link")
select("#/contacts/4711")

26
htmlparser-0.1-1.rockspec Normal file
View File

@ -0,0 +1,26 @@
-- LuaRocks rockspec for the "htmlparser" rock, version 0.1, rockspec revision 1.
package = "htmlparser"
version = "0.1-1"
-- Where the sources are fetched from: the v0.1 branch of the git repository.
-- NOTE(review): the URL uses the unauthenticated git:// protocol; confirm the
-- host still serves it, otherwise switch to a git+https:// URL.
source = {
url = "git://github.com/wscherphof/lua-htmlparser.git",
branch = "v0.1"
}
-- Human-readable metadata for the rock.
description = {
summary = "Parse HTML text into a tree of elements with selectors",
detailed = [[
Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand.
]],
homepage = "http://wscherphof.github.com/lua-htmlparser/",
license = "MIT"
}
-- Requirements: Lua 5.2 or newer and the "set" rock, 0.1 or newer.
dependencies = {
"lua >= 5.2",
"set >= 0.1"
}
-- "builtin" build type: each entry maps a module name, as passed to
-- require(), to the source file that provides it.
build = {
type = "builtin",
modules = {
htmlparser = "src/htmlparser.lua",
["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua",
["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua"
}
}

View File

@ -1,5 +1,5 @@
local ElementNode = require("ElementNode")
local voidelements = require("voidelements")
local ElementNode = require("htmlparser.ElementNode")
local voidelements = require("htmlparser.voidelements")
local HtmlParser = {}

View File

@ -1,4 +1,5 @@
local Set = require "Set"
require("luarocks.loader")
local Set = require("Set")
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}