diff --git a/src/htmlparser.lua b/src/htmlparser.lua index 8d00716..6ec174b 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -1,4 +1,4 @@ --- vim: ft=lua ts=2 +-- vim: ft=lua ts=2 sw=2 local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end local str = tostring @@ -11,16 +11,13 @@ local voidelements = require("htmlparser.voidelements") local HtmlParser = {} -local tpl_rep={ - -- Replace table for template engines syntax that can confuse us. +local tpr = { -- Here we're replacing confusing sequences -- (things looking like tags, but appearing where tags can't) -- with definitelly invalid utf sequence, and later we'll replace them back - ["<%"] = char(208,209), - ["%>"] = char(209,208), + ["<"] = char(208,209,208,209), + [">"] = char(209,208,209,208), } -local tpl_rep_rev = {} - local function parse(text,limit) local text=str(text) @@ -28,15 +25,38 @@ local function parse(text,limit) local limit = limit or htmlparser_looplimit or 1000 local tpl = false - for k,v in pairs(tpl_rep) do - local mtc="("..esc(k)..")" - if text:match(mtc) then - tpl=true - text=text:gsub(mtc,tpl_rep) - tpl_rep_rev[v]=k; - end + + local function g(id,...) + local arg={...} + arg[id]=tpr[arg[id]] + tpl=true + return table.concat(arg) end + text = text + :gsub( + "(<)".. + "([^>]-)".. + "(<)", + function(...)return g(3,...)end + ):gsub( + "("..tpr["<"]..")".. + "([^%w%s])".. + "([^%2]-)".. + "(%2)".. + "(>)".. + "([^>]-)".. + "(>)", + function(...)return g(5,...)end + ):gsub( + [=[(['"])]=].. + [=[([^'>"]-)]=].. + "(>)".. + [=[([^'>"]-)]=].. + [=[(['"])]=], + function(...)return g(3,...)end + ) + local index = 0 local root = ElementNode:new(index, str(text)) @@ -80,7 +100,7 @@ local function parse(text,limit) if not k or k == "/>" or k == ">" then break end if eq == "=" then - local pattern = "=([^%s>]*)" + pattern = "=([^%s>]*)" if quote ~= "" then pattern = quote .. "([^" .. quote .. "]*)" .. quote end @@ -90,11 +110,8 @@ local function parse(text,limit) v=v or "" if tpl then - for rk,rv in pairs(tpl_rep_rev) do - local mtc="("..esc(rk)..")" - if text:match(mtc) then - v = v:gsub(mtc,tpl_rep_rev) - end + for rk,rv in pairs(tpr) do + v = v:gsub(rv,rk) end end @@ -133,11 +150,8 @@ local function parse(text,limit) end if tpl then - for k,v in pairs(tpl_rep_rev) do - local mtc="("..esc(k)..")" - if text:match(mtc) then - root._text = root._text:gsub(mtc,tpl_rep_rev) - end + for k,v in pairs(tpr) do + root._text = root._text:gsub(v,k) end end diff --git a/tst/init.lua b/tst/init.lua index 54d0dfd..3ea7ae2 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -1,3 +1,4 @@ +-- vim: ft=lua ts=2 sw=2 -- Omit next line in actual module clients; it's only to support development of the module itself package.path = "../src/?.lua;" .. package.path @@ -308,6 +309,19 @@ end function test_loop_limit() local tree = htmlparser.parse([[ + moo + moo + moo + foo=bar> + /> + > + /> + >k + >o +
>
+

+ with unclosed attribute +

]]) -- issue#42 assert(1==1)