diff --git a/htmlparser.lua b/htmlparser.lua
index be80013..bcc4ad0 100644
--- a/htmlparser.lua
+++ b/htmlparser.lua
@@ -16,14 +16,20 @@ local function parse(text)
 
     local tagst, apos = tag:gettext(), 1
     while true do
-      local start, k, quote, v
-      start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos)
+      local start, k, eq, quote, v
+      start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos)
       if not k then break end
-      local pattern = "=([^%s'\">]*)"
-      if quote ~= '' then
-        pattern = quote .. "([^" .. quote .. "]*)" .. quote
+      if eq == "" then
+        v = ""
+      else
+        local pattern = "=([^%s'\">]*)"
+        if quote ~= '' then
+          pattern = quote .. "([^" .. quote .. "]*)" .. quote
+        end
+        start, apos, v = string.find(tagst, pattern, apos)
+        -- an unterminated quote leaves apos nil, which would restart the scan at 1 forever
+        if not start then break end
       end
-      start, apos, v = string.find(tagst, pattern, apos)
       tag:addattribute(k, v)
     end
 
@@ -31,17 +37,16 @@ local function parse(text)
       descend = false
       tag:close()
     else
-      opentags[tag.name] = tag
+      opentags[tag.name] = opentags[tag.name] or {}
+      table.insert(opentags[tag.name], tag)
     end
 
     local closeend = tpos
     while true do
       local closestart, closing, closename
       closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend)
-      closing = closing and closing ~= ''
-      if not closing then break end
-      tag = opentags[closename]
-      opentags[closename] = nil
+      if not closing or closing == "" then break end
+      tag = table.remove(opentags[closename])
       closestart = string.find(root._text, "<", closestart)
       tag:close(closestart, closeend + 1)
       node = tag.parent
diff --git a/test.html b/test.html
index 3e86fb5..3b5f88b 100644
--- a/test.html
+++ b/test.html
@@ -39,5 +39,24 @@
+

About me

+
+ Hello, my name is + John Doe, + I am a + graduate research assistant + at the + University of Dreams. + My friends call me + Johnny. + You can visit my homepage at + . +
+ I live at + 1234 Peach Drive, + Warner Robins, + Georgia. +
+
\ No newline at end of file
diff --git a/test.lua b/test.lua
index 9d3ffc9..0d42f4a 100644
--- a/test.lua
+++ b/test.lua
@@ -51,22 +51,24 @@ select("ul > *")
 select("body [class]")
 select("body > [class]")
 
+print("\nchapters")
 local sel, chapters = root("ol.chapters > li"), {}
 for _,v in ipairs(sel.nodes) do
   table.insert(chapters, v:getcontent())
 end
-print("\nchapters")
+-- print
 for i,v in ipairs(chapters) do
   print(i, v)
 end
 
+print("\ncontacts")
 local sel, contacts = root("ul.contacts > li")("span[class]"), {}
 for _,v in ipairs(sel.nodes) do
   local id = v.parent.parent.id -- li > a > span
   contacts[id] = contacts[id] or {}
   contacts[id][v.classes[1]] = v:getcontent()
 end
-print("\ncontacts")
+-- print
 for k,v in pairs(contacts) do
   print(k)
   for fk,fv in pairs(v) do
@@ -74,5 +76,46 @@ for k,v in pairs(contacts) do
   end
 end
 
-
-
+print("\nmicrodata")
+local sel, scopes = root("[itemprop]"), {}
+for _,prop in ipairs(sel.nodes) do
+  -- an element that is itself an itemscope is not stored as a value; it shows up as a nested table once one of its properties is filed
+  if prop.attributes["itemscope"] then goto nextprop end
+  local descendantscopes, scope = {}, prop
+  while true do
+    repeat
+      scope = scope.parent
+    until not scope or (scope.attributes and scope.attributes["itemscope"])
+    -- itemprop without an enclosing itemscope: nothing to attach it to
+    if not scope then goto nextprop end
+    if not scope.attributes["itemprop"] then break end
+    table.insert(descendantscopes, 1, scope)
+  end
+  scopes[scope] = scopes[scope] or {}
+  local entry = scopes[scope]
+  for _,v in ipairs(descendantscopes) do
+    entry[v] = entry[v] or {}
+    entry = entry[v]
+  end
+  local k, v = prop.attributes["itemprop"], prop:getcontent()
+  entry[k] = v
+  ::nextprop::
+end
+-- print
+local function printscope(node, props, level)
+  level = level or 1
+  local scopeprop = node.attributes["itemprop"] or ""
+  -- itemtype may be absent on an itemscope; fall back to "" instead of concatenating nil
+  local scopetype = node.attributes["itemtype"] or ""
+  print(string.rep(" ", level - 1) .. scopetype, scopeprop)
+  for prop,v in pairs(props) do
+    if type(prop) == "table" then
+      printscope(prop, v, level + 1)
+    else
+      print(string.rep(" ", level) .. prop .. "=[" .. v .. "]")
+    end
+  end
+end
+for node,props in pairs(scopes) do
+  printscope(node, props)
+end
\ No newline at end of file