From 82dbddfd19e4a4d7e35910ad49e54cc83638459f Mon Sep 17 00:00:00 2001 From: Wouter Scherphof Date: Thu, 21 Mar 2013 14:03:17 +0100 Subject: [PATCH] parse valueless attributes (e.g. itemscope) And allow nested tags, which was a newly encountered bug --- htmlparser.lua | 25 ++++++++++++++----------- test.html | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/htmlparser.lua b/htmlparser.lua index be80013..bcc4ad0 100644 --- a/htmlparser.lua +++ b/htmlparser.lua @@ -16,14 +16,18 @@ local function parse(text) local tagst, apos = tag:gettext(), 1 while true do - local start, k, quote, v - start, apos, k, quote = string.find(tagst, "%s+([^%s=]+)=(['\"]?)", apos) + local start, k, eq, quote, v + start, apos, k, eq, quote = string.find(tagst, "%s+([^%s=]+)(=?)(['\"]?)", apos) if not k then break end - local pattern = "=([^%s'\">]*)" - if quote ~= '' then - pattern = quote .. "([^" .. quote .. "]*)" .. quote + if eq == "" then + v = "" + else + local pattern = "=([^%s'\">]*)" + if quote ~= '' then + pattern = quote .. "([^" .. quote .. "]*)" .. quote + end + start, apos, v = string.find(tagst, pattern, apos) end - start, apos, v = string.find(tagst, pattern, apos) tag:addattribute(k, v) end @@ -31,17 +35,16 @@ local function parse(text) descend = false tag:close() else - opentags[tag.name] = tag + opentags[tag.name] = opentags[tag.name] or {} + table.insert(opentags[tag.name], tag) end local closeend = tpos while true do local closestart, closing, closename closestart, closeend, closing, closename = string.find(root._text, "[^<]*<(/?)(%w+)", closeend) - closing = closing and closing ~= '' - if not closing then break end - tag = opentags[closename] - opentags[closename] = nil + if not closing or closing == "" then break end + tag = table.remove(opentags[closename]) closestart = string.find(root._text, "<", closestart) tag:close(closestart, closeend + 1) node = tag.parent diff --git a/test.html b/test.html index 3e86fb5..c3ea16d 100644 --- a/test.html +++ b/test.html @@ -39,5 +39,23 @@ +
+ Hello, my name is + John Doe, + I am a + graduate research assistant + at the + University of Dreams. + My friends call me + Johnny. + You can visit my homepage at + . +
+ I live at + 1234 Peach Drive, + Warner Robins, + Georgia. +
+
\ No newline at end of file