From 82dbddfd19e4a4d7e35910ad49e54cc83638459f Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Thu, 21 Mar 2013 14:03:17 +0100 Subject: [PATCH 1/3] parse valueless attributes (e.g. itemscope) And allow nested tags, which was a newly encountered bug --- htmlparser.lua | 25 ++++++++++++++----------- test.html | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/htmlparser.lua b/htmlparser.lua index be80013..bcc4ad0 100644 --- a/htmlparser.lua +++ b/htmlparser.lua @@ -16,14 +16,18 @@ local function parse(text) local tagst, apos = tag:gettext(), 1 while true do - local start, k, quote, v - start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos) + local start, k, eq, quote, v + start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) if not k then break end - local pattern = "=([^%s'\">]*)" - if quote ~= '' then - pattern = quote .. "([^" .. quote .. "]*)" .. quote + if eq == "" then + v = "" + else + local pattern = "=([^%s'\">]*)" + if quote ~= '' then + pattern = quote .. "([^" .. quote .. "]*)" .. 
quote + end + start, apos, v = string.find(tagst, pattern, apos) end - start, apos, v = string.find(tagst, pattern, apos) tag:addattribute(k, v) end @@ -31,17 +35,16 @@ local function parse(text) descend = false tag:close() else - opentags[tag.name] = tag + opentags[tag.name] = opentags[tag.name] or {} + table.insert(opentags[tag.name], tag) end local closeend = tpos while true do local closestart, closing, closename closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend) - closing = closing and closing ~= '' - if not closing then break end - tag = opentags[closename] - opentags[closename] = nil + if not closing or closing == "" then break end + tag = table.remove(opentags[closename]) closestart = string.find(root._text, "<", closestart) tag:close(closestart, closeend + 1) node = tag.parent diff --git a/test.html b/test.html index 3e86fb5..c3ea16d 100644 --- a/test.html +++ b/test.html @@ -39,5 +39,23 @@ +
+ Hello, my name is + John Doe, + I am a + graduate research assistant + at the + University of Dreams. + My friends call me + Johnny. + You can visit my homepage at + . +
+ I live at + 1234 Peach Drive, + Warner Robins, + Georgia. +
+
\ No newline at end of file From 78d99a61f6f7915ca67ea57480ae00228d537f52 Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Thu, 21 Mar 2013 14:30:29 +0100 Subject: [PATCH 2/3] incomplete test for extracting microdata See -- TODO --- test.lua | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test.lua b/test.lua index 9d3ffc9..2dd046e 100644 --- a/test.lua +++ b/test.lua @@ -74,5 +74,21 @@ for k,v in pairs(contacts) do end end +print("\nmicrodata") +local sel, scopes = root("[itemscope]"), {} +for i,v in ipairs(sel.nodes) do + local type = v.attributes["itemtype"] + if not v.attributes["itemprop"] then + scopes[type] = scopes[type] or {} + local item = {} + local sel = sel("[itemprop]") + for i,v in ipairs(sel.nodes) do + -- TODO + print("prop", v.attributes["itemprop"]) + end + table.insert(scopes[type], item) + end +end + From f9b04866b41f862d5393ae0dbd08396e3c65b3bf Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Fri, 22 Mar 2013 00:10:24 +0100 Subject: [PATCH 3/3] closes #7 Extracting microdata is fairly tedious compared to microformats, due to its generality. Should probably be included as a standard function of the parser, or even better, as a separate module, which then can concern itself with being fully standards compliant. --- test.html | 35 +++++++++++++++++----------------- test.lua | 56 ++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 34 deletions(-) diff --git a/test.html b/test.html index c3ea16d..3b5f88b 100644 --- a/test.html +++ b/test.html @@ -39,23 +39,24 @@ -
- Hello, my name is - John Doe, - I am a - graduate research assistant - at the - University of Dreams. - My friends call me - Johnny. - You can visit my homepage at - . -
- I live at - 1234 Peach Drive, - Warner Robins, - Georgia. +

About me

+
+ Hello, my name is + John Doe, + I am a + graduate research assistant + at the + University of Dreams. + My friends call me + Johnny. + You can visit my homepage at + . +
+ I live at + 1234 Peach Drive, + Warner Robins, + Georgia. +
-
\ No newline at end of file diff --git a/test.lua b/test.lua index 2dd046e..0d42f4a 100644 --- a/test.lua +++ b/test.lua @@ -51,22 +51,24 @@ select("ul > *") select("body [class]") select("body > [class]") +print("\nchapters") local sel, chapters = root("ol.chapters > li"), {} for _,v in ipairs(sel.nodes) do table.insert(chapters, v:getcontent()) end -print("\nchapters") +-- print for i,v in ipairs(chapters) do print(i, v) end +print("\ncontacts") local sel, contacts = root("ul.contacts > li")("span[class]"), {} for _,v in ipairs(sel.nodes) do local id = v.parent.parent.id -- li > a > span contacts[id] = contacts[id] or {} contacts[id][v.classes[1]] = v:getcontent() end -print("\ncontacts") +-- print for k,v in pairs(contacts) do print(k) for fk,fv in pairs(v) do @@ -75,20 +77,40 @@ for k,v in pairs(contacts) do end print("\nmicrodata") -local sel, scopes = root("[itemscope]"), {} -for i,v in ipairs(sel.nodes) do - local type = v.attributes["itemtype"] - if not v.attributes["itemprop"] then - scopes[type] = scopes[type] or {} - local item = {} - local sel = sel("[itemprop]") - for i,v in ipairs(sel.nodes) do - -- TODO - print("prop", v.attributes["itemprop"]) - end - table.insert(scopes[type], item) +local sel, scopes = root("[itemprop]"), {} +for _,prop in ipairs(sel.nodes) do + if prop.attributes["itemscope"] then goto nextprop end + local descendantscopes, scope = {}, prop + while true do + repeat + scope = scope.parent + until scope.attributes["itemscope"] + if not scope.attributes["itemprop"] then break end + table.insert(descendantscopes, 1, scope) + end + scopes[scope] = scopes[scope] or {} + local entry = scopes[scope] + for _,v in ipairs(descendantscopes) do + entry[v] = entry[v] or {} + entry = entry[v] + end + local k, v = prop.attributes["itemprop"], prop:getcontent() + entry[k] = v + ::nextprop:: +end +-- print +local function printscope(node, table, level) + level = level or 1 + local scopeprop = node.attributes["itemprop"] or "" + print(string.rep(" 
", level - 1) .. node.attributes["itemtype"], scopeprop) + for prop,v in pairs(table) do + if type(prop) == "table" then + printscope(prop, v, level + 1) + else + print(string.rep(" ", level) .. prop .. "=[" .. v .. "]") + end end end - - - +for node,table in pairs(scopes) do + printscope(node, table) +end \ No newline at end of file