mirror of
https://github.com/msva/lua-htmlparser.git
synced 2024-11-04 23:34:20 +00:00
First draft for a Rock setup
This commit is contained in:
parent
3af809df9a
commit
4a9fa0a790
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,8 +1,9 @@
|
|||||||
# LuaRocks #
|
# LuaRocks #
|
||||||
######################
|
######################
|
||||||
|
bin/
|
||||||
lib/
|
lib/
|
||||||
share/
|
share/
|
||||||
bin/
|
*.rock
|
||||||
|
|
||||||
# OS generated files #
|
# OS generated files #
|
||||||
######################
|
######################
|
||||||
|
77
README.md
77
README.md
@ -1,4 +1,75 @@
|
|||||||
lua-htmlparser
|
# LuaRock "htmlparser"
|
||||||
==============
|
|
||||||
|
|
||||||
An HTML parser for lua.
|
Parse HTML text into a tree of elements with selectors
|
||||||
|
|
||||||
|
### License
|
||||||
|
MIT; see ./doc/LICENSE
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
Start off with
|
||||||
|
```lua
|
||||||
|
require("luarocks.loader")
|
||||||
|
local htmlparser = require("htmlparser")
|
||||||
|
```
|
||||||
|
Then, parse some html:
|
||||||
|
```lua
|
||||||
|
local root = htmlparser.parse(htmlstring)
|
||||||
|
```
|
||||||
|
The input to parse may be the contents of a complete html document, or any valid html snippet, as long as all tags are correctly opened and closed.
|
||||||
|
Now, find specific elements by selecting:
|
||||||
|
```lua
|
||||||
|
local elements = root:select(selectorstring)
|
||||||
|
```
|
||||||
|
Or in shorthand:
|
||||||
|
```lua
|
||||||
|
local elements = root(selectorstring)
|
||||||
|
```
|
||||||
|
This will return a Set of elements, all of which are of the same type as the root element, and thus support selecting as well, if ever needed:
|
||||||
|
```lua
|
||||||
|
for e in pairs(elements) do
|
||||||
|
print(e.name)
|
||||||
|
local subs = e(subselectorstring)
|
||||||
|
for sub in pairs(subs) do
|
||||||
|
print("", sub.name)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
### Selectors
|
||||||
|
- "element"
|
||||||
|
- "#id"
|
||||||
|
- ".class"
|
||||||
|
- "[attribute]"
|
||||||
|
- "[attribute=value]"
|
||||||
|
- "[attribute!=value]"
|
||||||
|
- "[attribute|=value]"
|
||||||
|
- "[attribute*=value]"
|
||||||
|
- "[attribute~=value]"
|
||||||
|
- "[attribute^=value]"
|
||||||
|
- "[attribute$=value]"
|
||||||
|
- ":not(selector)"
|
||||||
|
- "ancestor descendant"
|
||||||
|
- "parent > child"
|
||||||
|
Selectors can be combined; e.g. ".class:not([attribute]) element.class"
|
||||||
|
|
||||||
|
#### Limitations
|
||||||
|
- Attribute values in selectors currently cannot contain any spaces, since space is interpreted as a delimiter between ancestor and descendant, parent and >, or > and child parts of the selector
|
||||||
|
- Likewise, for the parent > child relation, the spaces before and after the > are mandatory
|
||||||
|
|
||||||
|
### Element type
|
||||||
|
The tree elements provide, apart from :select and (), the following accessors:
|
||||||
|
- .name = the element's tag name
|
||||||
|
- .attributes = a table with keys and values for the element's attributes
|
||||||
|
- .id = the value of the element's id attribute, if present
|
||||||
|
- .classes = an array with the classes listed in the element's class attribute, if any
|
||||||
|
- :getcontent() = the raw text between the opening and closing tags of the element
|
||||||
|
- .nodes = an array with the element's child elements
|
||||||
|
- .parent = the element that contains this element; root.parent is nil
|
||||||
|
- :gettext() = the raw text of the complete element, starting with `"<tagname"` and ending with `"/>"`
|
||||||
|
- .level = how deep the element is in the tree; root level is 0
|
||||||
|
- .root = the root element of the tree; root.root is root
|
||||||
|
- .deepernodes = a Set containing all elements in the tree beneath this element, including this element's .nodes
|
||||||
|
- .deeperelements = a table with a key for each distinct tagname in .deepernodes, containing a Set of all deeper element nodes with that name
|
||||||
|
- .deeperattributes = as .deeperelements, but keyed on attribute name
|
||||||
|
- .deeperids = as .deeperelements, but keyed on id value
|
||||||
|
- .deeperclasses = as .deeperelements, but keyed on class name
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
|
require("luarocks.loader")
|
||||||
|
-- Omit next line in actual module clients; it's only to support development of the module itself
|
||||||
|
package.path = "../src/?.lua;" .. package.path
|
||||||
local htmlparser = require("htmlparser")
|
local htmlparser = require("htmlparser")
|
||||||
|
|
||||||
local io = require("io")
|
local io = require("io")
|
||||||
local file = io.input("./test.html")
|
local file = io.input("./sample.html")
|
||||||
local text = io.read("*a") file:close()
|
local text = io.read("*a") file:close()
|
||||||
|
|
||||||
local root = htmlparser.parse(text)
|
local root = htmlparser.parse(text)
|
||||||
@ -29,6 +32,7 @@ local function select( s )
|
|||||||
end
|
end
|
||||||
print(sel:len())
|
print(sel:len())
|
||||||
end
|
end
|
||||||
|
|
||||||
select("*")
|
select("*")
|
||||||
select("link")
|
select("link")
|
||||||
select("#/contacts/4711")
|
select("#/contacts/4711")
|
26
htmlparser-0.1-1.rockspec
Normal file
26
htmlparser-0.1-1.rockspec
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
package = "htmlparser"
|
||||||
|
version = "0.1-1"
|
||||||
|
source = {
|
||||||
|
url = "git://github.com/wscherphof/lua-htmlparser.git",
|
||||||
|
branch = "v0.1"
|
||||||
|
}
|
||||||
|
description = {
|
||||||
|
summary = "Parse HTML text into a tree of elements with selectors",
|
||||||
|
detailed = [[
|
||||||
|
Call parse() to build up a tree of element nodes. Each node in the tree, including the root node that is returned by parse(), supports a basic set of jQuery-like selectors. Or you could walk the tree by hand.
|
||||||
|
]],
|
||||||
|
homepage = "http://wscherphof.github.com/lua-htmlparser/",
|
||||||
|
license = "MIT"
|
||||||
|
}
|
||||||
|
dependencies = {
|
||||||
|
"lua >= 5.2",
|
||||||
|
"set >= 0.1"
|
||||||
|
}
|
||||||
|
build = {
|
||||||
|
type = "builtin",
|
||||||
|
modules = {
|
||||||
|
htmlparser = "src/htmlparser.lua",
|
||||||
|
["htmlparser.ElementNode"] = "src/htmlparser/ElementNode.lua",
|
||||||
|
["htmlparser.voidelements"] = "src/htmlparser/voidelements.lua"
|
||||||
|
}
|
||||||
|
}
|
@ -1,5 +1,5 @@
|
|||||||
local ElementNode = require("ElementNode")
|
local ElementNode = require("htmlparser.ElementNode")
|
||||||
local voidelements = require("voidelements")
|
local voidelements = require("htmlparser.voidelements")
|
||||||
|
|
||||||
local HtmlParser = {}
|
local HtmlParser = {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user