mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-04 23:34:20 +00:00
Merge pull request #15 from wscherphof/#2-#3-#4-LuaRocks
closes #1 #2 #3 #5
This commit is contained in:
commit
f20d84c626
15
.gitignore
vendored
15
.gitignore
vendored
@ -1,2 +1,17 @@
|
||||
# LuaRocks #
|
||||
######################
|
||||
bin/
|
||||
lib/
|
||||
share/
|
||||
*.rock
|
||||
|
||||
# OS generated files #
|
||||
######################
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
Icon?
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
94
README.md
94
README.md
@ -1,4 +1,92 @@
|
||||
lua-htmlparser
|
||||
==============
|
||||
# LuaRock "htmlparser"
|
||||
|
||||
An HTML parser for lua.
|
||||
Parse HTML text into a tree of elements with selectors
|
||||
|
||||
[1]: http://wscherphof.github.com/lua-set/
|
||||
[2]: http://api.jquery.com/category/selectors/
|
||||
|
||||
## License
|
||||
MIT; see `./doc/LICENSE`
|
||||
|
||||
## Usage
|
||||
Start off with
|
||||
```lua
|
||||
require("luarocks.loader")
|
||||
local htmlparser = require("htmlparser")
|
||||
```
|
||||
Then, parse some html:
|
||||
```lua
|
||||
local root = htmlparser.parse(htmlstring)
|
||||
```
|
||||
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
|
||||
Now, find specific contained elements by selecting:
|
||||
```lua
|
||||
local elements = root:select(selectorstring)
|
||||
```
|
||||
Or in shorthand:
|
||||
```lua
|
||||
local elements = root(selectorstring)
|
||||
```
|
||||
This will return a [Set][1] of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed:
|
||||
```lua
|
||||
for e in pairs(elements) do
|
||||
print(e.name)
|
||||
local subs = e(subselectorstring)
|
||||
for sub in pairs(subs) do
|
||||
print("", sub.name)
|
||||
end
|
||||
end
|
||||
```
|
||||
The root element is a container for the top level elements in the parsed text, i.e. the `<html>` element in a parsed html document would be a child of the returned root element.
|
||||
|
||||
## Selectors
|
||||
Supported selectors are a subset of [jQuery's selectors][2]:
|
||||
|
||||
- `"*"` all contained elements
|
||||
- `"element"` elements with the given tagname
|
||||
- `"#id"` elements with the given id attribute value
|
||||
- `".class"` elements with the given classname in the class attribute
|
||||
- `"[attribute]"` elements with an attribute of the given name
|
||||
- `"[attribute='value']"` equals: elements with the given value for the attribute with the given name
|
||||
- `"[attribute!='value']"` not equals: elements without an attribute of the given name, or with that attribute, but with a value that is different from the given value
|
||||
- `"[attribute|='value']"` prefix: attribute's value is given value, or starts with given value, followed by a hyphen (`-`)
|
||||
- `"[attribute*='value']"` contains: attribute's value contains given value
|
||||
- `"[attribute~='value']"` word: attribute's value is a space-separated token, where one of the tokens is the given value
|
||||
- `"[attribute^='value']"` starts with: attribute's value starts with given value
|
||||
- `"[attribute$='value']"` ends with: attribute's value ends with given value
|
||||
- `":not(selectorstring)"` elements not selected by given selector string
|
||||
- `"ancestor descendant"` elements selected by the `descendant` selector string, that are a descendant of any element selected by the `ancestor` selector string
|
||||
- `"parent > child"` elements selected by the `child` selector string, that are a child element of any element selected by the `parent` selector string
|
||||
|
||||
Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
|
||||
|
||||
### Limitations
|
||||
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between the `ancestor` and `descendant`, `parent` and `>`, or `>` and `child` parts of the selector
|
||||
- Likewise, for the `parent > child` relation, the spaces before and after the `>` are mandatory
|
||||
- `<!` elements are not parsed, including doctype and comments
|
||||
- Textnodes are not separate entries in the tree, so the content of `<p>line1<br />line2</p>` is plainly `"line1<br />line2"`
|
||||
|
||||
## Examples
|
||||
See `./doc/samples.lua`
|
||||
|
||||
## Element type
|
||||
All tree elements provide, apart from `:select` and `()`, the following accessors:
|
||||
|
||||
### Basic
|
||||
- `.name` the element's tagname
|
||||
- `.attributes` a table with keys and values for the element's attributes; `{}` if none
|
||||
- `.id` the value of the element's id attribute; `nil` if not present
|
||||
- `.classes` an array with the classes listed in element's class attribute; `{}` if none
|
||||
- `:getcontent()` the raw text between the opening and closing tags of the element; `""` if none
|
||||
- `.nodes` an array with the element's child elements, `{}` if none
|
||||
- `.parent` the element that contains this element; `root.parent` is `nil`
|
||||
|
||||
### Other
|
||||
- `:gettext()` the raw text of the complete element, starting with `"<tagname"` and ending with `"/>"`
|
||||
- `.level` how deep the element is in the tree; root level is `0`
|
||||
- `.root` the root element of the tree; `root.root` is `root`
|
||||
- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none
|
||||
- `.deeperelements` a table with a key for each distinct tagname in `.deepernodes`, containing a [Set][1] of all deeper element nodes with that name; `{}` if none
|
||||
- `.deeperattributes` as `.deeperelements`, but keyed on attribute name
|
||||
- `.deeperids` as `.deeperelements`, but keyed on id value
|
||||
- `.deeperclasses` as `.deeperelements`, but keyed on class name
|
||||
|
87
Set.lua
87
Set.lua
@ -1,87 +0,0 @@
|
||||
-- Module table; doubles as the prototype that set instances fall back to.
local Set = {}
-- Metatable shared by every set instance. Missing keys on an instance are
-- looked up on Set, which is how instances find their methods.
Set.mt = {}
Set.mt.__index = Set
|
||||
--- Create a new set.
-- Members are stored as keys of the instance, each mapped to `true`.
-- @param t optional initial contents:
--   * a table with a non-empty array part (`#t > 0`): its array values
--     become the members;
--   * any other table: its keys become the members;
--   * any other non-nil value: that value becomes the single member;
--   * nil: the set starts out empty.
-- @return a new table that has `Set.mt` as its metatable
function Set:new(t)
	local instance = {}
	if type(t) == "table" then
		if #t > 0 then
			-- List-like input: the values are the members.
			for _, v in ipairs(t) do
				instance[v] = true
			end
		else
			-- Map-like (or empty) input: the keys are the members.
			for k in pairs(t) do
				instance[k] = true
			end
		end
	elseif t ~= nil then
		-- A lone value must be stored as a key. Storing it positionally
		-- (`{t}`) would make the number 1 the member instead of t itself.
		instance[t] = true
	end
	return setmetatable(instance, Set.mt)
end
|
||||
|
||||
--- Add a member to this set.
-- Adding a member that is already present leaves the set unchanged.
-- @param e the value to add; it is used as a table key
function Set:add(e)
	self[e] = true
end
|
||||
|
||||
--- Remove a member from this set.
-- Removing a value that is not a member leaves the set unchanged.
-- @param e the value to remove
function Set:remove(e)
	self[e] = nil
end
|
||||
|
||||
-- Union: `a + b` yields a new set holding every member of a and of b.
-- An operand that is not already a set is first converted with Set:new.
Set.mt.__add = function (a, b)
	local union = Set:new()
	local left, right = a, b
	if getmetatable(left) ~= Set.mt then
		left = Set:new(left)
	end
	if getmetatable(right) ~= Set.mt then
		right = Set:new(right)
	end
	for _, operand in ipairs({left, right}) do
		for member in pairs(operand) do
			union[member] = true
		end
	end
	return union
end
|
||||
|
||||
-- Subtraction: `a - b` yields a new set holding the members of a that are
-- not members of b. An operand that is not already a set is first
-- converted with Set:new.
Set.mt.__sub = function (a, b)
	local difference = Set:new()
	local minuend, subtrahend = a, b
	if getmetatable(minuend) ~= Set.mt then
		minuend = Set:new(minuend)
	end
	if getmetatable(subtrahend) ~= Set.mt then
		subtrahend = Set:new(subtrahend)
	end
	-- Start from a copy of the left operand ...
	for member in pairs(minuend) do
		difference[member] = true
	end
	-- ... then drop everything that occurs in the right operand.
	for member in pairs(subtrahend) do
		difference[member] = nil
	end
	return difference
end
|
||||
|
||||
-- Intersection: `a * b` yields a new set holding the members that occur in
-- both a and b. An operand that is not already a set is first converted
-- with Set:new.
Set.mt.__mul = function (a, b)
	local res = Set:new()
	if getmetatable(a) ~= Set.mt then a = Set:new(a) end
	if getmetatable(b) ~= Set.mt then b = Set:new(b) end
	for k in pairs(a) do
		-- Use rawget: a plain `b[k]` falls through __index to the Set
		-- prototype, so a member named like a prototype field (e.g. "add",
		-- "len", "mt") would wrongly count as present in b.
		if rawget(b, k) ~= nil then
			res[k] = true
		end
	end
	return res
end
|
||||
|
||||
-- String representation: "{m1, m2, ...}" listing every member.
-- Members are visited with pairs, so their order is unspecified.
Set.mt.__tostring = function (set)
	local parts = {}
	for k in pairs(set) do
		-- tostring() lets members of any type be rendered; the `..` operator
		-- raises an error for tables, booleans and functions.
		parts[#parts + 1] = tostring(k)
	end
	-- One table.concat instead of repeated `..` avoids quadratic copying.
	return "{" .. table.concat(parts, ", ") .. "}"
end
|
||||
|
||||
--- Count the members of this set.
-- Walks every key stored directly on the instance.
-- @return the number of members; 0 for an empty set
function Set:len()
	local count = 0
	local member = next(self)
	while member ~= nil do
		count = count + 1
		member = next(self, member)
	end
	return count
end
|
||||
|
||||
--- Collect the members of this set into an array.
-- Members are visited with pairs, so their order is unspecified.
-- @return a new array table holding each member once
function Set:tolist()
	local list = {}
	local size = 0
	for member in pairs(self) do
		size = size + 1
		list[size] = member
	end
	return list
end
|
||||
|
||||
return Set
|
14
doc/README.html
Normal file
14
doc/README.html
Normal file
@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Htmlparser LuaRock Readme</title>
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="Refresh" content="3; url=http://wscherphof.github.com/lua-htmlparser/" />
|
||||
</head>
|
||||
<body>
|
||||
<p>You are being redirected to the homepage of the
|
||||
<a href="http://wscherphof.github.com/lua-htmlparser/">Htmlparser LuaRock</a>.
|
||||
</p>
|
||||
<p>If you are not redirected after a few seconds, please click on the link above!</p>
|
||||
</body>
|
||||
</html>
|
@ -1,7 +1,10 @@
|
||||
require("luarocks.loader")
|
||||
-- Omit next line in actual module clients; it's only to support development of the module itself
|
||||
package.path = "../src/?.lua;" .. package.path
|
||||
local htmlparser = require("htmlparser")
|
||||
|
||||
local io = require("io")
|
||||
local file = io.input("./test.html")
|
||||
local file = io.input("./sample.html")
|
||||
local text = io.read("*a") file:close()
|
||||
|
||||
local root = htmlparser.parse(text)
|
||||
@ -29,6 +32,7 @@ local function select( s )
|
||||
end
|
||||
print(sel:len())
|
||||
end
|
||||
|
||||
select("*")
|
||||
select("link")
|
||||
select("#/contacts/4711")
|
26
htmlparser-0.1-1.rockspec
Normal file
26
htmlparser-0.1-1.rockspec
Normal file
@ -0,0 +1,26 @@
|
||||
package = "htmlparser"
|
||||
version = "0.1-1"
|
||||
source = {
|
||||
url = "git://github.com/wscherphof/lua-htmlparser.git",
|
||||
branch = "v0.1"
|
||||
}
|
||||
description = {
|
||||
summary = "Parse HTML text into a tree of elements with selectors",
|
||||
detailed = [[
|
||||
Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand.
|
||||
]],
|
||||
homepage = "http://wscherphof.github.com/lua-htmlparser/",
|
||||
license = "MIT"
|
||||
}
|
||||
dependencies = {
|
||||
"lua >= 5.2",
|
||||
"set >= 0.1"
|
||||
}
|
||||
build = {
|
||||
type = "builtin",
|
||||
modules = {
|
||||
htmlparser = "src/htmlparser.lua",
|
||||
["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua",
|
||||
["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua"
|
||||
}
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
local ElementNode = require("ElementNode")
|
||||
local voidelements = require("voidelements")
|
||||
local ElementNode = require("htmlparser.ElementNode")
|
||||
local voidelements = require("htmlparser.voidelements")
|
||||
|
||||
local HtmlParser = {}
|
||||
|
@ -1,4 +1,5 @@
|
||||
local Set = require "Set"
|
||||
require("luarocks.loader")
|
||||
local Set = require("Set")
|
||||
|
||||
local ElementNode = {}
|
||||
ElementNode.mt = {__index = ElementNode}
|
Loading…
Reference in New Issue
Block a user