-- ***************************************************************
--
-- Parse HTML 4.01 strict (mostly)
-- Copyright 2020 by Sean Conner. All Rights Reserved.
--
-- This library is free software; you can redistribute it and/or modify it
-- under the terms of the GNU Lesser General Public License as published by
-- the Free Software Foundation; either version 3 of the License, or (at your
-- option) any later version.
--
-- This library is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-- License for more details.
--
-- You should have received a copy of the GNU Lesser General Public License
-- along with this library; if not, see http://www.gnu.org/licenses/.
--
-- Comments, questions and criticisms can be sent to: sean@conman.org
--
-- ********************************************************************
-- luacheck: ignore 611 631
-- HTML 4.01 strict
local ENTITIES = require "org.conman.const.entity"
local lpeg = require "lpeg"
local Carg = lpeg.Carg
local Cmt = lpeg.Cmt
local Cc = lpeg.Cc
local Cf = lpeg.Cf
local Cg = lpeg.Cg
local Cs = lpeg.Cs
local Ct = lpeg.Ct
local C = lpeg.C
local P = lpeg.P
local R = lpeg.R
local S = lpeg.S
local V = lpeg.V
-- EMPTY matches the empty string; it is the body of tags with no content.
local EMPTY = P(true)
-- A single whitespace character.
local WS    = S" \t\r\n"

-- Convert the digits of a decimal character reference (the "65" of
-- "&#65;") to UTF-8.  utf8.char() raises an error for values it cannot
-- encode; that used to abort the whole parse.  Here the error is trapped
-- and NO value is returned, and a capture without a value inside lpeg.Cs()
-- leaves the matched text untouched, so a bad reference passes through
-- literally.
local function numeric_entity(digits)
  local okay,char = pcall(utf8.char,digits)
  if okay then
    return char
  end
end

-- One unit of character data, used inside lpeg.Cs():
--   * decimal character reference -> UTF-8 character
--   * named reference             -> looked up in ENTITIES
--   * a run of whitespace         -> one space, or kept verbatim while
--                                    state.pre is set (state is the first
--                                    extra argument given to match())
--   * anything else               -> itself
local CHAR = P"&#" * C(R"09"^1) * P";" / numeric_entity
           + P"&" * C(R("az","AZ","09")^1) * P";" / ENTITIES
           + Cmt( -- This exists JUST to support the PRE tag.
                  C(S" \t\r\n"^1 * Carg(1)),
                  function(_,position,cap,state)
                    return position,state.pre and cap or " "
                  end
                )
           + P(1)

-- Same as CHAR, for attribute values: whitespace runs always collapse to a
-- single space and no parser state is consulted.
local CHARa = P"&#" * C(R"09"^1) * P";" / numeric_entity
            + P"&" * C(R("az","AZ","09")^1) * P";" / ENTITIES
            + S" \t\r\n"^1 / " "
            + P(1)
-- *************************************************************************
-- H(s)  builds a pattern matching the string s without regard to case.
-- Hc(s) does the same, and the match yields s itself as its capture, so
--       the result is always spelled the way the grammar spelled it.
local Hc,H do
  -- A letter turns into a choice between its lower and upper case forms.
  local letter = R("AZ","az") / function(ch)
    return P(ch:lower()) + P(ch:upper())
  end

  -- Any other character turns into a pattern for exactly that character.
  local other = P(1) / function(ch)
    return P(ch)
  end

  -- Fold the per-character patterns into one sequence.
  local function sequence(acc,nxt)
    return acc * nxt
  end

  local cis = Cf((letter + other)^1,sequence)

  function H(s)
    return cis:match(s)
  end

  function Hc(s)
    return cis:match(s) / s
  end
end
-- *************************************************************************
-- Build the pattern for an inline element.
--
--   name     - tag name (matched without regard to case)
--   attrib   - pattern matching one attribute, yielding a name and a value
--   body     - pattern for the element content
--   optclose - when true the closing tag may be omitted
--
-- The pattern captures a table with the fields 'tag', 'attributes' and
-- 'inline' (always true); captures made by body land in the same table.
local function tagi(name,attrib,body,optclose)
  -- Store each name/value pair in the attribute table being folded.
  local function collect(tbl,key,val)
    tbl[key] = val
    return tbl
  end

  local attributes = Cf(Ct"" * Cg(attrib)^0,collect)

  -- The lookahead after the name keeps 'b' from matching '<big'.
  local starttag = P"<" * Cg(Hc(name),'tag') * #S" \t\r\n>"
                 * Cg(attributes,'attributes')
                 * Cg(Cc(true),'inline')
                 * WS^0 * P">"

  local endtag = P"</" * H(name) * P">"

  return Ct(starttag * body * (optclose and endtag^-1 or endtag))
end
-- *************************************************************************
-- Build the pattern for a block element.
--
--   name     - tag name (matched without regard to case)
--   attrib   - pattern matching one attribute, yielding a name and a value
--   body     - pattern for the element content
--   optclose - when true the closing tag may be omitted
--   optopen  - when true the opening tag may be omitted as well; the
--              captured table then carries the given name and an empty
--              attribute table
--
-- Whitespace around the opening tag and after the closing tag is consumed.
-- The pattern captures a table with the fields 'tag', 'attributes' and
-- 'block' (always true); captures made by body land in the same table.
--
-- 'pre' is special: its tags flip state.pre, where state is the first extra
-- argument given to match(); CHAR reads that flag to decide whether
-- whitespace is preserved.
local function tagb(name,attrib,body,optclose,optopen)
  -- Store each name/value pair in the attribute table being folded.
  local function collect(tbl,key,val)
    tbl[key] = val
    return tbl
  end

  local otag,ctag

  if name ~= 'pre' then
    otag = Hc(name)
    ctag = H(name)
  else
    otag = Cmt(
      C(H(name) * #S" \t\r\n>" * Carg(1)),
      function(_,position,capture,state)
        state.pre = true
        return position,capture:lower()
      end
    )
    ctag = Cmt(
      C(H(name) * Carg(1)),
      function(_,position,_,state)
        state.pre = false
        return position
      end
    )
  end

  local attributes = Cf(Ct"" * Cg(attrib)^0,collect)
  local marker     = Cg(Cc(true),'block')

  local starttag = WS^0 * P"<" * Cg(otag,'tag') * #S" \t\r\n>"
                 * Cg(attributes,'attributes')
                 * marker
                 * WS^0 * P">" * WS^0

  local endtag = P"</" * ctag * P">" * WS^0

  if optclose then
    endtag = endtag^-1
  end

  if optopen then
    -- No opening tag in the input: supply the fields it would have set.
    local implied = Cg(Cc(name),'tag') * Cg(Ct"",'attributes') * marker
    starttag = starttag + implied
  end

  return Ct(starttag * body * endtag)
end
-- *************************************************************************
-- Build the pattern for one attribute inside a tag.
--
--   name  - attribute name (matched without regard to case, and it must be
--           preceded by at least one whitespace character)
--   value - optional pattern for the value; when absent any text is taken,
--           with character references decoded and whitespace runs
--           collapsed (see CHARa)
--
-- The value may be single quoted, double quoted or bare.  The pattern
-- yields two captures: the name as given here, and the value, which is ""
-- when the attribute appears without '='.
local function attribute(name,value)
  -- inner wrapped in a pair of identical quote characters.
  local function quoted(quote,inner)
    return P(quote) * inner * P(quote)
  end

  local key    = S" \t\r\n"^1 * Hc(name) * #S" \t\r\n=>"
  local equals = S" \t\r\n"^0 * P'=' * S" \t\r\n"^0
  local val

  if not value then
    val = quoted("'",Cs((CHARa - P"'")^0))
        + quoted('"',Cs((CHARa - P'"')^0))
        + Cs((CHARa - S" \t\r\n>")^0)
  else
    val = quoted("'",value)
        + quoted('"',value)
        + value
  end

  return key * (equals * val + Cc"")
end
-- *************************************************************************
-- Character data: one or more CHAR units up to the next '<', with the
-- substitutions made by CHAR applied.
local PCDATA  = Cs((CHAR - P"<")^1)

-- HTML comments.  Each yields a table whose 'comment' field is the text
-- between the delimiters (a named group with no inner capture takes the
-- whole match as its value).  Cblock also consumes trailing whitespace.
--
-- NOTE(review): these two definitions were damaged in this copy of the
-- file; everything from the opening delimiter to the first closing
-- delimiter had been stripped, leaving unbalanced parentheses.  They are
-- rebuilt here from the surviving tail of each line.  The lost text may
-- have set further fields (e.g. an 'inline'/'block' marker as the tag
-- builders do) -- confirm against a pristine copy.
local Cinline = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->")
local Cblock  = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->") * WS^0
-- One pattern per known attribute, keyed by attribute name ('-' becomes
-- '_', and 'for' is stored as 'forr' since 'for' is a Lua keyword).  An
-- entry built with a second argument only accepts values matching that
-- pattern; the others accept any text.  The trailing comment names the
-- HTML 4.01 value type.
--
-- Corrected literals (each one kept valid markup from parsing):
--   align          'justiry'        -> 'justify'
--   accept_charset 'accept_charset' -> 'accept-charset'
--   datapagesize   'datapasesize'   -> 'datapagesize'
--   disabled       'disable'        -> 'disabled'
local ATTR =
{
  abbr              = attribute('abbr'),              -- Text
  accept            = attribute('accept'),            -- ContentType
  accept_charset    = attribute('accept-charset'),    -- Charsets
  accesskey         = attribute('accesskey'),         -- Character
  action            = attribute('action'),            -- URI
  align             = attribute('align',Hc'left' + Hc'center' + Hc'right' + Hc'justify' + Hc'char'),
  allowfullscreen   = attribute('allowfullscreen',Hc'true'),
  allowscriptaccess = attribute('allowscriptaccess'), -- ???
  alt               = attribute('alt'),               -- Text or CDATA
  archive           = attribute('archive'),           -- CDATA
  axis              = attribute('axis'),              -- CDATA
  bgcolor           = attribute('bgcolor'),           -- Color
  border            = attribute('border'),            -- Pixels
  cellpadding       = attribute('cellpadding'),       -- Length
  cellspacing       = attribute('cellspacing'),       -- Length
  char              = attribute('char'),              -- Character
  charoff           = attribute('charoff'),           -- Length
  charset           = attribute('charset'),           -- Charset
  checked           = attribute('checked',Hc'checked'),
  cite              = attribute('cite'),              -- URI
  class             = attribute('class'),             -- CDATA
  classid           = attribute('classid'),           -- URI
  codebase          = attribute('codebase'),          -- URI
  codetype          = attribute('codetype'),          -- ContentType
  color             = attribute('color'),             -- Color
  cols              = attribute('cols',R"09"^1/tonumber),
  colspan           = attribute('colspan',R"09"^1/tonumber),
  content           = attribute('content'),           -- CDATA
  coords            = attribute('coords'),            -- Coords
  data              = attribute('data'),              -- URI
  datafld           = attribute('datafld'),           -- CDATA
  datapagesize      = attribute('datapagesize'),      -- CDATA
  datasrc           = attribute('datasrc'),           -- URI
  datetime          = attribute('datetime'),          -- Datetime
  declare           = attribute('declare',Hc'declare'),
  defer             = attribute('defer',Hc'defer'),
  dir               = attribute('dir', Hc'ltr' + Hc'rtl'),
  disabled          = attribute('disabled',Hc'disabled'),
  enctype           = attribute('enctype'),           -- ContentType
  event             = attribute('event'),             -- CDATA
  face              = attribute('face'),              -- CDATA
  flashvars         = attribute('flashvars'),         -- ???
  forr              = attribute('for'),               -- URI
  frame             = attribute('frame',Hc'void' + Hc'above' + Hc'below' + Hc'hsides' + Hc'lhs' + Hc'rhs' + Hc'vsides' + Hc'box' + Hc'border'),
  headers           = attribute('headers'),           -- IDREFS
  height            = attribute('height'),            -- Length
  href              = attribute('href'),              -- URI
  hreflang          = attribute('hreflang'),          -- LanguageCode
  http_equiv        = attribute('http-equiv'),        -- NAME
  id                = attribute('id'),                -- ID
  ismap             = attribute('ismap',Hc'ismap'),
  label             = attribute('label'),             -- Text
  lang              = attribute('lang'),              -- LanguageCode
  longdesc          = attribute('longdesc'),          -- URI
  maxlength         = attribute('maxlength',R"09"^1/tonumber),
  media             = attribute('media'),             -- MediaDesc
  method            = attribute('method',Hc'GET' + Hc'POST'),
  multiple          = attribute('multiple',Hc'multiple'),
  name              = attribute('name'),              -- CDATA
  nohref            = attribute('nohref',Hc'nohref'),
  onblur            = attribute('onblur'),            -- Script
  onchange          = attribute('onchange'),          -- Script
  onclick           = attribute('onclick'),           -- Script
  ondblclick        = attribute('ondblclick'),        -- Script
  onfocus           = attribute('onfocus'),           -- Script
  onkeydown         = attribute('onkeydown'),         -- Script
  onkeypress        = attribute('onkeypress'),        -- Script
  onkeyup           = attribute('onkeyup'),           -- Script
  onmousedown       = attribute('onmousedown'),       -- Script
  onmousemove       = attribute('onmousemove'),       -- Script
  onmouseout        = attribute('onmouseout'),        -- Script
  onmouseover       = attribute('onmouseover'),       -- Script
  onmouseup         = attribute('onmouseup'),         -- Script
  onreset           = attribute('onreset'),           -- Script
  onselect          = attribute('onselect'),          -- Script
  onsubmit          = attribute('onsubmit'),          -- Script
  pluginspage       = attribute('pluginspage'),       -- URI
  profile           = attribute('profile'),           -- URI
  quality           = attribute('quality'),           -- ???
  readonly          = attribute('readonly',Hc'readonly'),
  rel               = attribute('rel'),               -- LinkTypes
  rev               = attribute('rev'),               -- LinkTypes
  rows              = attribute('rows',R"09"^1/tonumber),
  rowspan           = attribute('rowspan',R"09"^1/tonumber),
  rules             = attribute('rules',Hc'none' + Hc'groups' + Hc'rows' + Hc'cols' + Hc'all'),
  scheme            = attribute('scheme'),            -- CDATA
  scope             = attribute('scope'),             -- Scope
  selected          = attribute('selected',Hc'selected'),
  shape             = attribute('shape'),             -- Shape
  size              = attribute('size'),              -- CDATA
  span              = attribute('span',R"09"^1/tonumber),
  src               = attribute('src'),               -- URI
  standby           = attribute('standby'),           -- Text
  start             = attribute('start',R"09"^1 / tonumber), -- XXX non standard
  style             = attribute('style'),             -- StyleSheet
  summary           = attribute('summary'),           -- Text
  tabindex          = attribute('tabindex',R"09"^1/tonumber),
  title             = attribute('title'),             -- Text
  type              = attribute('type'),              -- ContentType
  type2             = attribute('type',Hc'button' + Hc'submit' + Hc'reset'),
  usemap            = attribute('usemap'),            -- URI
  valign            = attribute('valign',Hc'top' + Hc'middle' + Hc'bottom' + Hc'baseline'),
  value             = attribute('value'),             -- CDATA
  valuetype         = attribute('valuetype',Hc'DATA' + Hc'REF' + Hc'OBJECT'),
  width             = attribute('width'),             -- Length
}
-- Attribute sets.  Each is an ordered choice over ATTR entries and is
-- handed to tagi()/tagb() as the "one attribute" pattern for an element.
--
-- Shared groups, combined into the per-element sets further down.
local coreattrs = ATTR.id + ATTR.class + ATTR.style + ATTR.title
local i18n = ATTR.lang + ATTR.dir
local events = ATTR.onclick + ATTR.ondblclick + ATTR.onmousedown
+ ATTR.onmouseup + ATTR.onmouseover + ATTR.onmousemove
+ ATTR.onmouseout + ATTR.onkeypress + ATTR.onkeydown
+ ATTR.onkeyup
local reserved = ATTR.datasrc + ATTR.datafld
-- The common set: core + internationalisation + event handlers.
local attrs = coreattrs + i18n + events
-- Horizontal and vertical alignment of table cells.
local cellhalign = ATTR.align + ATTR.char + ATTR.charoff
local cellvalign = ATTR.valign
-- Per-element sets; the prefix names the rule in parse_tags that uses it.
local A_attr = attrs
+ ATTR.charset + ATTR.type + ATTR.name + ATTR.href
+ ATTR.hreflang + ATTR.rel + ATTR.rev + ATTR.accesskey
+ ATTR.shape + ATTR.coords + ATTR.tabindex + ATTR.onfocus
+ ATTR.onblur
local IMG_attr = attrs
+ ATTR.src + ATTR.alt + ATTR.longdesc + ATTR.name
+ ATTR.height + ATTR.width + ATTR.usemap + ATTR.ismap
-- SCRIPT starts from 'events' rather than 'attrs', so the core and i18n
-- attributes are not accepted on it.
local SCRIPT_attr = events
+ ATTR.charset + ATTR.type + ATTR.src
+ ATTR.defer + ATTR.forr
local AREA_attr = attrs
+ ATTR.shape + ATTR.coords + ATTR.href
+ ATTR.nohref + ATTR.alt + ATTR.tabindex
+ ATTR.accesskey + ATTR.onfocus + ATTR.onblur
local OBJECT_attr = attrs
+ ATTR.declare + ATTR.classid + ATTR.codebase + ATTR.data
+ ATTR.type + ATTR.codetype + ATTR.archive + ATTR.standby
+ ATTR.height + ATTR.width + ATTR.usemap + ATTR.name
+ ATTR.tabindex + reserved
local PARAM_attr = ATTR.id + ATTR.name + ATTR.value + ATTR.valuetype
+ ATTR.type
local EMBED_attr = attrs
+ ATTR.align
+ ATTR.allowfullscreen
+ ATTR.allowscriptaccess
+ ATTR.bgcolor
+ ATTR.flashvars
+ ATTR.height
+ ATTR.href
+ ATTR.pluginspage
+ ATTR.quality
+ ATTR.src
+ ATTR.type
+ ATTR.width
local FONT_attr = coreattrs + i18n
+ ATTR.size + ATTR.color + ATTR.face
local INPUT_attr = attrs
+ ATTR.type + ATTR.name + ATTR.value + ATTR.checked
+ ATTR.disabled + ATTR.readonly + ATTR.size + ATTR.maxlength
+ ATTR.src + ATTR.alt + ATTR.usemap + ATTR.ismap
+ ATTR.tabindex + ATTR.accesskey + ATTR.onfocus + ATTR.onblur
+ ATTR.onselect + ATTR.onchange + ATTR.accept + reserved
local SELECT_attr = attrs
+ ATTR.name + ATTR.size + ATTR.multiple + ATTR.disabled
+ ATTR.tabindex + ATTR.onfocus + ATTR.onblur + ATTR.onchange
+ reserved
local TEXTAREA_attr = attrs
+ ATTR.name + ATTR.rows + ATTR.cols + ATTR.disabled
+ ATTR.readonly + ATTR.tabindex + ATTR.accesskey + ATTR.onfocus
+ ATTR.onblur + ATTR.onselect + ATTR.onchange + reserved
local LABEL_attr = attrs
+ ATTR.forr + ATTR.accesskey + ATTR.onfocus + ATTR.onblur
-- BUTTON uses ATTR.type2, which restricts 'type' to button/submit/reset.
local BUTTON_attr = attrs
+ ATTR.name + ATTR.value + ATTR.type2
+ ATTR.disabled + ATTR.tabindex + ATTR.accesskey
+ ATTR.onfocus + ATTR.onblur + reserved
local FORM_attr = attrs
+ ATTR.action + ATTR.method + ATTR.enctype + ATTR.accept
+ ATTR.name + ATTR.onsubmit + ATTR.onreset + ATTR.accept_charset
local TABLE_attr = attrs
+ ATTR.summary + ATTR.width + ATTR.border
+ ATTR.frame + ATTR.rules + ATTR.cellspacing
+ ATTR.cellpadding + ATTR.datapagesize + reserved
-- Shared by the TH and TD rules.
local THD_attr = attrs
+ ATTR.abbr + ATTR.axis + ATTR.headers + ATTR.scope
+ ATTR.rowspan + ATTR.colspan + cellhalign + cellvalign
-- *************************************************************************
-- The grammar proper.  The first entry names the start rule ('BODY').
-- Rules in lower case are groupings of alternatives; rules in upper case
-- are single elements built with tagi() (inline) or tagb() (block).  For
-- both builders a trailing 'true' makes the closing tag optional; a second
-- trailing 'true' on tagb() makes the opening tag optional too.
local parse_tags = P {
'BODY',
BODY = V'flow', -- XXX
flow = V'block' + V'inline',
--=======================================================================
-- Inline content: elements, comments and character data.
inline = V'fontstyle' + V'phrase' + V'special' + V'iINS'
+ V'formctrl' + V'iDEL' + Cinline + PCDATA,
fontstyle = V'TT' + V'I' + V'B' + V'BIG' + V'SMALL' + V'U',
TT = tagi('tt' , attrs , V'inline'^0),
I = tagi('i' , attrs , V'inline'^0),
B = tagi('b' , attrs , V'inline'^0),
BIG = tagi('big' , attrs , V'inline'^0),
SMALL = tagi('small' , attrs , V'inline'^0),
U = tagi('u' , attrs , V'inline'^0), -- XXX non-standard
phrase = V'EM' + V'STRONG' + V'DFN' + V'CODE' + V'SAMP'
+ V'KBD' + V'VAR' + V'CITE' + V'ABBR' + V'ACRONYM',
EM = tagi('em' , attrs , V'inline'^0),
STRONG = tagi('strong' , attrs , V'inline'^0),
DFN = tagi('dfn' , attrs , V'inline'^0),
CODE = tagi('code' , attrs , V'inline'^0),
SAMP = tagi('samp' , attrs , V'inline'^0),
KBD = tagi('kbd' , attrs , V'inline'^0),
VAR = tagi('var' , attrs , V'inline'^0),
CITE = tagi('cite' , attrs , V'inline'^0),
ABBR = tagi('abbr' , attrs , V'inline'^0),
ACRONYM = tagi('acronym', attrs , V'inline'^0),
special = V'A' + V'IMG' + V'BR' + V'SCRIPT' + V'BDO'
+ V'Q' + V'SUB' + V'SUP' + V'SPAN' + V'OBJECT'
+ V'FONT' + V'MAP',
-- An anchor may not directly contain another anchor.
A = tagi('a' , A_attr , (V'inline' - V'A')^0),
-- EMPTY body + optional close: elements written without content.
IMG = tagi('img' , IMG_attr , EMPTY,true),
BR = tagi('br' , coreattrs , EMPTY,true),
-- Script text runs up to the closing tag and goes through CHAR like any
-- other character data.
SCRIPT = tagi('script' , SCRIPT_attr , Cs((CHAR - (P"</" * Hc'script' * P">"))^0)), -- Script
Q = tagi('q' , (attrs + ATTR.cite),V'inline'^0),
SUB = tagi('sub' , attrs , V'inline'^0),
SUP = tagi('sup' , attrs , V'inline'^0),
SPAN = tagi('span' , (attrs + reserved) , V'inline'^0),
BDO = tagi('bdo' , (coreattrs + ATTR.lang + ATTR.dir),V'inline'^0),
MAP = tagi('map' , (attrs + ATTR.name) , (V'block' + V'AREA' + WS)^1),
AREA = tagi('area' , AREA_attr , EMPTY , true),
OBJECT = tagi('object' , OBJECT_attr , (V'PARAM' + V'EMBED' + V'flow')^0),
PARAM = tagi('param' , PARAM_attr , EMPTY , true),
EMBED = tagi('embed' , EMBED_attr , V'inline'^0), -- XXX non-standard
FONT = tagi('font' , FONT_attr , V'inline'^0), -- XXX non-standard
formctrl = V'INPUT' + V'SELECT' + V'TEXTAREA' + V'LABEL' + V'BUTTON',
INPUT = tagi('input' , INPUT_attr , EMPTY,true),
SELECT = tagi('select' , SELECT_attr , (V'OPTGROUP' + V'OPTION' + WS)^1),
TEXTAREA = tagi('textarea' , TEXTAREA_attr , PCDATA^0),
-- A label may not directly contain another label.
LABEL = tagi('label' , LABEL_attr , (V'inline' - V'LABEL')^0),
-- A button may not directly contain anchors, form controls, forms or
-- fieldsets.
BUTTON = tagi('button' , BUTTON_attr , (V'flow' - (V'A' + V'formctrl' + V'FORM' + V'FIELDSET'))^0),
OPTGROUP = tagi('optgroup' , (attrs + ATTR.disabled + ATTR.label),(V'OPTION' + WS)^1),
OPTION = tagi('option' , (attrs + ATTR.selected + ATTR.disabled + ATTR.label + ATTR.value),PCDATA^0,true),
-- INS and DEL exist in an inline form (here) and a block form (bINS and
-- bDEL below); both accept flow content.
iINS = tagi('ins' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
iDEL = tagi('del' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
--=======================================================================
-- Block content: elements and comments.
block = V'P' + V'PRE' + V'DL' + V'NOSCRIPT' + V'BLOCKQUOTE'
+ V'FORM' + V'HR' + V'TABLE' + V'FIELDSET' + V'ADDRESS'
+ V'H1' + V'H2' + V'H3' + V'H4' + V'H5'
+ V'H6' + V'UL' + V'OL' + V'DIV'
+ V'bINS' + V'bDEL' + Cblock ,
-- Inline elements that may not appear directly inside PRE.
pre_exclude = V'IMG' + V'OBJECT' + V'BIG' + V'SMALL' + V'SUB' + V'SUP',
P = tagb('p' , attrs , V'inline'^0,true),
PRE = tagb('pre' , attrs , (V'inline' - V'pre_exclude')^0),
BLOCKQUOTE = tagb('blockquote' , (attrs + ATTR.cite),(V'block' + V'SCRIPT')^1),
HR = tagb('hr' , attrs , EMPTY,true),
ADDRESS = tagb('address' , attrs , V'inline'^0),
H1 = tagb('h1' , attrs , V'inline'^0),
H2 = tagb('h2' , attrs , V'inline'^0),
H3 = tagb('h3' , attrs , V'inline'^0),
H4 = tagb('h4' , attrs , V'inline'^0),
H5 = tagb('h5' , attrs , V'inline'^0),
H6 = tagb('h6' , attrs , V'inline'^0),
DIV = tagb('div' , (attrs + reserved),V'flow'^0),
DL = tagb('dl' , attrs , (V'DT' + V'DD')^1),
DT = tagb('dt' , attrs , V'inline'^0,true),
DD = tagb('dd' , attrs , V'flow'^0,true),
-- Lists accept comments between their items.
UL = tagb('ul' , attrs , (V'LI' + Cblock)^1),
OL = tagb('ol' , attrs + ATTR.start + ATTR.type, (V'LI' + Cblock)^1), -- XXX non-standard
LI = tagb('li' , attrs , V'flow'^0,true),
NOSCRIPT = tagb('noscript' , attrs , V'block'^1),
-- A form may not directly contain another form.
FORM = tagb('form' , FORM_attr , ((V'block' + V'SCRIPT') - V'FORM')^0),
FIELDSET = tagb('fieldset' , attrs , (V'LEGEND' + V'flow' + PCDATA)^0),
LEGEND = tagb('legend' , (attrs + ATTR.accesskey),V'inline'^0),
-- Table parts must come in this order: caption, columns, head, foot, and
-- then at least one body.
TABLE = tagb('table' , TABLE_attr,V'CAPTION'^-1 * (V'COL' + V'COLGROUP')^0 * V'THEAD'^-1 * V'TFOOT'^-1 * V'TBODY'^1),
CAPTION = tagb('caption' , attrs,V'inline'^0),
COL = tagb('col' , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),EMPTY,true),
COLGROUP = tagb('colgroup' , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),V'COL'^0,true),
THEAD = tagb('thead' , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
TFOOT = tagb('tfoot' , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
-- TBODY is the one element whose opening tag may be left out as well.
TBODY = tagb('tbody' , (attrs + cellhalign + cellvalign) , V'TR'^1,true,true),
TR = tagb('tr' , (attrs + cellhalign + cellvalign) , (V'TH' + V'TD')^1,true),
TH = tagb('th' , THD_attr , V'flow'^0,true),
TD = tagb('td' , THD_attr , V'flow'^0,true),
bINS = tagb('ins' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
bDEL = tagb('del' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
}
-- *************************************************************************
-- The module value is a single pattern.  Matching yields two captures: a
-- table holding one entry per top-level node that was parsed, and the
-- position just past the last of them.  The grammar reads and writes the
-- field 'pre' of its first extra argument (via lpeg.Carg(1)), so match()
-- must be given a table for it: pattern:match(text,1,{}).
return Ct(parse_tags^1) * lpeg.Cp()
-- text/plain; charset=us-ascii
-- This content has been proxied by September (3851b).