-- ***************************************************************


-- Parse HTML 4.01 strict (mostly)

-- Copyright 2020 by Sean Conner. All Rights Reserved.


-- This library is free software; you can redistribute it and/or modify it

-- under the terms of the GNU Lesser General Public License as published by

-- the Free Software Foundation; either version 3 of the License, or (at your

-- option) any later version.


-- This library is distributed in the hope that it will be useful, but

-- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public

-- License for more details.


-- You should have received a copy of the GNU Lesser General Public License

-- along with this library; if not, see http://www.gnu.org/licenses/.


-- Comments, questions and criticisms can be sent to: sean@conman.org


-- ********************************************************************

-- luacheck: ignore 611 631

-- HTML 4.01 strict

local ENTITIES = require "org.conman.const.entity"

local lpeg = require "lpeg"

local Carg = lpeg.Carg

local Cmt = lpeg.Cmt

local Cc = lpeg.Cc

local Cf = lpeg.Cf

local Cg = lpeg.Cg

local Cs = lpeg.Cs

local Ct = lpeg.Ct

local C = lpeg.C

local P = lpeg.P

local R = lpeg.R

local S = lpeg.S

local V = lpeg.V

-- Matches the empty string; used as the body of elements that have no
-- content (e.g. IMG, BR).
local EMPTY = P(true)

-- A single whitespace character.
local WS = S" \t\r\n"

-- One unit of character data, meant to be used inside a substitution
-- capture (Cs):
--   * "&#<decimal digits>;" is replaced by utf8.char() of that number
--     (only decimal references are recognised by this alternative),
--   * "&<name>;" is replaced by the ENTITIES entry for <name>,
--   * a run of whitespace is kept verbatim while state.pre is set and is
--     collapsed to a single space otherwise,
--   * any other single character is taken as-is.
-- "state" is the first extra argument handed to match() (Carg(1)); it is
-- indexed here, so callers must pass a table.
local CHAR = P"&#" * C(R"09"^1) * P";" / utf8.char
           + P"&"  * C(R("az","AZ","09")^1) * P";" / ENTITIES
           + Cmt( -- This exists JUST to support the PRE tag.
                  C(S" \t\r\n"^1 * Carg(1)),
                  function(_,position,cap,state)
                    -- cap is the matched whitespace run (never empty), so
                    -- the and/or idiom is safe here.
                    return position,state.pre and cap or " "
                  end
                )
           + P(1)

-- Same as CHAR, except that whitespace runs are always collapsed to a
-- single space and no match state is consulted.  Used for attribute values.
local CHARa = P"&#" * C(R"09"^1) * P";" / utf8.char
            + P"&"  * C(R("az","AZ","09")^1) * P";" / ENTITIES
            + S" \t\r\n"^1 / " "
            + P(1)

-- *************************************************************************

-- H(s)  builds a pattern that matches the string s, ignoring the case of
--       ASCII letters; it produces no capture of its own.
-- Hc(s) builds the same pattern, but additionally produces s as its
--       capture value (s is used as an LPeg string capture).
local Hc,H do
  -- Turns one character of the source string into a pattern: a letter
  -- becomes "lower-case or upper-case", anything else matches literally.
  local char = R("AZ","az")
             / function(c)
                 return P(c:lower()) + P(c:upper())
               end
             + P(1)
             / function(c)
                 return P(c)
               end

  -- Folds the per-character patterns into one sequence pattern.
  local cis = Cf(char^1,function(a,b) return a * b end)

  Hc = function(s)
    return cis:match(s) / s
  end

  H = function(s)
    return cis:match(s)
  end
end

-- *************************************************************************

-- Build the pattern for an inline element.
--
-- name     - element name (matched case-insensitively, captured as given)
-- attrib   - pattern matching ONE attribute, producing a name and a value
-- body     - pattern for the element content
-- optclose - when true, the closing tag may be omitted
--
-- The resulting pattern produces a table with the fields
--   tag        = name
--   attributes = { [attribute name] = value, ... }
--   inline     = true
-- followed by whatever captures body produces.
local function tagi(name,attrib,body,optclose)
  local open = P"<"
             * Cg(Hc(name),'tag') * #S" \t\r\n>"
             * Cg(
                   -- start with an empty table and store each name/value
                   -- pair produced by attrib into it
                   Cf(Ct"" * Cg(attrib)^0,function(t,n,v) t[n] = v return t end),
                   'attributes'
                 )
             * Cg(Cc(true),'inline')
             * WS^0 * P">"

  local close = P"</" * H(name) * P">"

  if optclose then
    close = close^-1
  end

  return Ct(open * body * close)
end

-- *************************************************************************

-- Build the pattern for a block element.
--
-- name     - element name (matched case-insensitively)
-- attrib   - pattern matching ONE attribute, producing a name and a value
-- body     - pattern for the element content
-- optclose - when true, the closing tag may be omitted
-- optopen  - when true, the opening tag may be omitted as well; the
--            element is then reported with an empty attribute table
--
-- The resulting pattern produces a table with the fields
--   tag        = name
--   attributes = { [attribute name] = value, ... }
--   block      = true
-- followed by whatever captures body produces.  Whitespace around the
-- opening tag and after the closing tag is consumed.
local function tagb(name,attrib,body,optclose,optopen)
  local otag,ctag do
    if name == 'pre' then
      -- PRE toggles state.pre at match time so that CHAR can preserve
      -- whitespace inside it.  "state" is the first extra argument given
      -- to match() (Carg(1)).
      otag = Cmt(
                  C(H(name) * #S" \t\r\n>" * Carg(1)),
                  function(_,position,capture,state)
                    state.pre = true
                    return position,capture:lower()
                  end
                )
      ctag = Cmt(
                  C(H(name) * Carg(1)),
                  function(_,position,_,state)
                    state.pre = false
                    return position
                  end
                )
    else
      otag = Hc(name)
      ctag = H(name)
    end
  end

  local open = WS^0 * P"<"
             * Cg(otag,'tag') * #S" \t\r\n>"
             * Cg(
                   -- start with an empty table and store each name/value
                   -- pair produced by attrib into it
                   Cf(Ct"" * Cg(attrib)^0,function(t,n,v) t[n] = v return t end),
                   'attributes'
                 )
             * Cg(Cc(true),'block')
             * WS^0 * P">" * WS^0

  local close = P"</" * ctag * P">" * WS^0

  if optclose then
    close = close^-1
  end

  if optopen then
    -- No opening tag present: synthesise the same three fields.
    return Ct((open + Cg(Cc(name),'tag') * Cg(Ct"",'attributes') * Cg(Cc(true),'block')) * body * close)
  end

  return Ct(open * body * close)
end

-- *************************************************************************

-- Build the pattern for one attribute.
--
-- name  - attribute name (matched case-insensitively, captured as given)
-- value - optional pattern for the attribute value; when absent any text
--         is accepted, with character references decoded and whitespace
--         runs collapsed (see CHARa)
--
-- The pattern requires leading whitespace and produces two captures: the
-- attribute name and its value.  The value may be single quoted, double
-- quoted or unquoted; when there is no "=value" part the value is "".
local function attribute(name,value)
  local this = S" \t\r\n"^1 * Hc(name) * #S" \t\r\n=>"
  local that do
    if value then
      that = P"'" * value * P"'"
           + P'"' * value * P'"'
           + value
    else
      that = P"'" * Cs((CHARa - P"'")^0) * P"'"
           + P'"' * Cs((CHARa - P'"')^0) * P'"'
           + Cs((CHARa - S" \t\r\n>")^0)
    end
  end

  return this
       * (S" \t\r\n"^0 * P'=' * S" \t\r\n"^0 * that + Cc"")
end

-- *************************************************************************

-- Text content: one or more CHAR units up to (not including) the next "<",
-- with the substitutions made by CHAR applied.
local PCDATA = Cs((CHAR - P"<")^1)

-- An HTML comment in inline context; produces { comment = <text> } where
-- <text> is everything between the opening and closing comment markers.
local Cinline = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->")

-- An HTML comment in block context; same as Cinline, but trailing
-- whitespace is consumed as well.
local Cblock = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->") * WS^0

-- One pattern per known attribute, keyed by a Lua-friendly identifier.
-- Each pattern produces the attribute name and its value (see attribute()).
-- The trailing comments name the HTML 4.01 data type of the value.
-- "forr" stands for the "for" attribute ("for" is a Lua keyword), and
-- "type2" is the restricted form of "type" used by BUTTON.
local ATTR =
{
  abbr              = attribute('abbr'), -- Text
  accept            = attribute('accept'), -- ContentType
  -- The HTML name is "accept-charset"; the underscore spelling that was
  -- matched before is still accepted for backward compatibility.
  accept_charset    = attribute('accept-charset') + attribute('accept_charset'), -- Charsets
  accesskey         = attribute('accesskey'), -- Character
  action            = attribute('action'), -- URI
  align             = attribute('align',Hc'left' + Hc'center' + Hc'right' + Hc'justify' + Hc'char'),
  allowfullscreen   = attribute('allowfullscreen',Hc'true'),
  allowscriptaccess = attribute('allowscriptaccess'), -- ???
  alt               = attribute('alt'), -- Text or CDATA
  archive           = attribute('archive'), -- CDATA
  axis              = attribute('axis'), -- CDATA
  bgcolor           = attribute('bgcolor'), -- Color
  border            = attribute('border'), -- Pixels
  cellpadding       = attribute('cellpadding'), -- Length
  cellspacing       = attribute('cellspacing'), -- Length
  char              = attribute('char'), -- Character
  charoff           = attribute('charoff'), -- Length
  charset           = attribute('charset'), -- Charset
  checked           = attribute('checked',Hc'checked'),
  cite              = attribute('cite'), -- URI
  class             = attribute('class'), -- CDATA
  classid           = attribute('classid'), -- URI
  codebase          = attribute('codebase'), -- URI
  codetype          = attribute('codetype'), -- ContentType
  color             = attribute('color'), -- Color
  cols              = attribute('cols',R"09"^1/tonumber),
  colspan           = attribute('colspan',R"09"^1/tonumber),
  content           = attribute('content'), -- CDATA
  coords            = attribute('coords'), -- Coords
  data              = attribute('data'), -- URI
  datafld           = attribute('datafld'), -- CDATA
  datapagesize      = attribute('datapagesize'), -- CDATA
  datasrc           = attribute('datasrc'), -- URI
  datetime          = attribute('datetime'), -- Datetime
  declare           = attribute('declare',Hc'declare'),
  defer             = attribute('defer',Hc'defer'),
  dir               = attribute('dir', Hc'ltr' + Hc'rtl'),
  -- "disabled" must be tried first; "disable" is the value that was
  -- accepted before and is kept for backward compatibility.
  disabled          = attribute('disabled',Hc'disabled' + Hc'disable'),
  enctype           = attribute('enctype'), -- ContentType
  event             = attribute('event'), -- CDATA
  face              = attribute('face'), -- CDATA
  flashvars         = attribute('flashvars'), -- ???
  forr              = attribute('for'), -- URI
  frame             = attribute('frame',Hc'void' + Hc'above' + Hc'below' + Hc'hsides' + Hc'lhs' + Hc'rhs' + Hc'vsides' + Hc'box' + Hc'border'),
  headers           = attribute('headers'), -- IDREFS
  height            = attribute('height'), -- Length
  href              = attribute('href'), -- URI
  hreflang          = attribute('hreflang'), -- LanguageCode
  http_equiv        = attribute('http-equiv'), -- NAME
  id                = attribute('id'), -- ID
  ismap             = attribute('ismap',Hc'ismap'),
  label             = attribute('label'), -- Text
  lang              = attribute('lang'), -- LanguageCode
  longdesc          = attribute('longdesc'), -- URI
  maxlength         = attribute('maxlength',R"09"^1/tonumber),
  media             = attribute('media'), -- MediaDesc
  method            = attribute('method',Hc'GET' + Hc'POST'),
  multiple          = attribute('multiple',Hc'multiple'),
  name              = attribute('name'), -- CDATA
  nohref            = attribute('nohref',Hc'nohref'),
  onblur            = attribute('onblur'), -- Script
  onchange          = attribute('onchange'), -- Script
  onclick           = attribute('onclick'), -- Script
  ondblclick        = attribute('ondblclick'), -- Script
  onfocus           = attribute('onfocus'), -- Script
  onkeydown         = attribute('onkeydown'), -- Script
  onkeypress        = attribute('onkeypress'), -- Script
  onkeyup           = attribute('onkeyup'), -- Script
  onmousedown       = attribute('onmousedown'), -- Script
  onmousemove       = attribute('onmousemove'), -- Script
  onmouseout        = attribute('onmouseout'), -- Script
  onmouseover       = attribute('onmouseover'), -- Script
  onmouseup         = attribute('onmouseup'), -- Script
  onreset           = attribute('onreset'), -- Script
  onselect          = attribute('onselect'), -- Script
  onsubmit          = attribute('onsubmit'), -- Script
  pluginspage       = attribute('pluginspage'), -- URI
  profile           = attribute('profile'), -- URI
  quality           = attribute('quality'), -- ???
  readonly          = attribute('readonly',Hc'readonly'),
  rel               = attribute('rel'), -- LinkTypes
  rev               = attribute('rev'), -- LinkTypes
  rows              = attribute('rows',R"09"^1/tonumber),
  rowspan           = attribute('rowspan',R"09"^1/tonumber),
  rules             = attribute('rules',Hc'none' + Hc'groups' + Hc'rows' + Hc'cols' + Hc'all'),
  scheme            = attribute('scheme'), -- CDATA
  scope             = attribute('scope'), -- Scope
  selected          = attribute('selected',Hc'selected'),
  shape             = attribute('shape'), -- Shape
  size              = attribute('size'), -- CDATA
  span              = attribute('span',R"09"^1/tonumber),
  src               = attribute('src'), -- URI
  standby           = attribute('standby'), -- Text
  start             = attribute('start',R"09"^1 / tonumber), -- XXX non standard
  style             = attribute('style'), -- StyleSheet
  summary           = attribute('summary'), -- Text
  tabindex          = attribute('tabindex',R"09"^1/tonumber),
  title             = attribute('title'), -- Text
  type              = attribute('type'), -- ContentType
  type2             = attribute('type',Hc'button' + Hc'submit' + Hc'reset'),
  usemap            = attribute('usemap'), -- URI
  valign            = attribute('valign',Hc'top' + Hc'middle' + Hc'bottom' + Hc'baseline'),
  value             = attribute('value'), -- CDATA
  valuetype         = attribute('valuetype',Hc'DATA' + Hc'REF' + Hc'OBJECT'),
  width             = attribute('width'), -- Length
}

-- Shared attribute groups.  Every value below is an ordered choice of
-- single-attribute patterns taken from ATTR; the order of the alternatives
-- is significant to LPeg and is kept exactly as it was.

-- id, class, style and title.
local coreattrs = ATTR.id + ATTR.class + ATTR.style + ATTR.title

-- Language and text direction.
local i18n = ATTR.lang + ATTR.dir

-- Mouse and keyboard event handlers.
local events = ATTR.onclick
             + ATTR.ondblclick
             + ATTR.onmousedown
             + ATTR.onmouseup
             + ATTR.onmouseover
             + ATTR.onmousemove
             + ATTR.onmouseout
             + ATTR.onkeypress
             + ATTR.onkeydown
             + ATTR.onkeyup

-- datasrc and datafld.
local reserved = ATTR.datasrc + ATTR.datafld

-- The combination accepted by most elements.
local attrs = coreattrs + i18n + events

-- Horizontal and vertical alignment of table cells.
local cellhalign = ATTR.align + ATTR.char + ATTR.charoff
local cellvalign = ATTR.valign

-- Per-element attribute sets, named after the grammar rule that uses them.

local A_attr = attrs
             + ATTR.charset + ATTR.type + ATTR.name
             + ATTR.href + ATTR.hreflang + ATTR.rel
             + ATTR.rev + ATTR.accesskey + ATTR.shape
             + ATTR.coords + ATTR.tabindex + ATTR.onfocus
             + ATTR.onblur

local IMG_attr = attrs
               + ATTR.src + ATTR.alt
               + ATTR.longdesc + ATTR.name
               + ATTR.height + ATTR.width
               + ATTR.usemap + ATTR.ismap

local SCRIPT_attr = events + ATTR.charset + ATTR.type + ATTR.src + ATTR.defer + ATTR.forr

local AREA_attr = attrs
                + ATTR.shape + ATTR.coords + ATTR.href + ATTR.nohref + ATTR.alt
                + ATTR.tabindex + ATTR.accesskey + ATTR.onfocus + ATTR.onblur

local OBJECT_attr = attrs
                  + ATTR.declare + ATTR.classid + ATTR.codebase
                  + ATTR.data + ATTR.type + ATTR.codetype
                  + ATTR.archive + ATTR.standby + ATTR.height
                  + ATTR.width + ATTR.usemap + ATTR.name
                  + ATTR.tabindex + reserved

local PARAM_attr = ATTR.id + ATTR.name + ATTR.value + ATTR.valuetype + ATTR.type

-- EMBED is not part of HTML 4.01 (see the EMBED rule in the grammar).
local EMBED_attr = attrs
                 + ATTR.align + ATTR.allowfullscreen + ATTR.allowscriptaccess
                 + ATTR.bgcolor + ATTR.flashvars + ATTR.height
                 + ATTR.href + ATTR.pluginspage + ATTR.quality
                 + ATTR.src + ATTR.type + ATTR.width

local FONT_attr = coreattrs + i18n + ATTR.size + ATTR.color + ATTR.face

local INPUT_attr = attrs
                 + ATTR.type + ATTR.name + ATTR.value
                 + ATTR.checked + ATTR.disabled + ATTR.readonly
                 + ATTR.size + ATTR.maxlength + ATTR.src
                 + ATTR.alt + ATTR.usemap + ATTR.ismap
                 + ATTR.tabindex + ATTR.accesskey + ATTR.onfocus
                 + ATTR.onblur + ATTR.onselect + ATTR.onchange
                 + ATTR.accept + reserved

local SELECT_attr = attrs
                  + ATTR.name + ATTR.size + ATTR.multiple
                  + ATTR.disabled + ATTR.tabindex + ATTR.onfocus
                  + ATTR.onblur + ATTR.onchange + reserved

local TEXTAREA_attr = attrs
                    + ATTR.name + ATTR.rows + ATTR.cols
                    + ATTR.disabled + ATTR.readonly + ATTR.tabindex
                    + ATTR.accesskey + ATTR.onfocus + ATTR.onblur
                    + ATTR.onselect + ATTR.onchange + reserved

local LABEL_attr = attrs + ATTR.forr + ATTR.accesskey + ATTR.onfocus + ATTR.onblur

-- BUTTON uses the restricted "type" (ATTR.type2).
local BUTTON_attr = attrs
                  + ATTR.name + ATTR.value + ATTR.type2 + ATTR.disabled + ATTR.tabindex
                  + ATTR.accesskey + ATTR.onfocus + ATTR.onblur + reserved

local FORM_attr = attrs
                + ATTR.action + ATTR.method
                + ATTR.enctype + ATTR.accept
                + ATTR.name + ATTR.onsubmit
                + ATTR.onreset + ATTR.accept_charset

local TABLE_attr = attrs
                 + ATTR.summary + ATTR.width + ATTR.border + ATTR.frame + ATTR.rules
                 + ATTR.cellspacing + ATTR.cellpadding + ATTR.datapagesize + reserved

-- Shared by the TH and TD rules.
local THD_attr = attrs
               + ATTR.abbr + ATTR.axis + ATTR.headers
               + ATTR.scope + ATTR.rowspan + ATTR.colspan
               + cellhalign + cellvalign

-- *************************************************************************

-- Grammar for one piece of HTML body content ("flow": a block element, an
-- inline element, a comment or text).  Inline elements are built with
-- tagi(), block elements with tagb(); see those functions for the shape of
-- the tables produced.
local parse_tags = P {
  'BODY', -- initial rule

  BODY = V'flow', -- XXX
  flow = V'block' + V'inline',

  -- ----------------------------------------------------------------------
  -- inline content
  -- ----------------------------------------------------------------------

  inline = V'fontstyle' + V'phrase' + V'special' + V'iINS'
         + V'formctrl'  + V'iDEL'   + Cinline    + PCDATA,

  fontstyle = V'TT' + V'I' + V'B' + V'BIG' + V'SMALL' + V'U',
  TT        = tagi('tt'      , attrs , V'inline'^0),
  I         = tagi('i'       , attrs , V'inline'^0),
  B         = tagi('b'       , attrs , V'inline'^0),
  BIG       = tagi('big'     , attrs , V'inline'^0),
  SMALL     = tagi('small'   , attrs , V'inline'^0),
  U         = tagi('u'       , attrs , V'inline'^0), -- XXX non-standard

  phrase  = V'EM'  + V'STRONG' + V'DFN'  + V'CODE' + V'SAMP'
          + V'KBD' + V'VAR'    + V'CITE' + V'ABBR' + V'ACRONYM',
  EM      = tagi('em'      , attrs , V'inline'^0),
  STRONG  = tagi('strong'  , attrs , V'inline'^0),
  DFN     = tagi('dfn'     , attrs , V'inline'^0),
  CODE    = tagi('code'    , attrs , V'inline'^0),
  SAMP    = tagi('samp'    , attrs , V'inline'^0),
  KBD     = tagi('kbd'     , attrs , V'inline'^0),
  VAR     = tagi('var'     , attrs , V'inline'^0),
  CITE    = tagi('cite'    , attrs , V'inline'^0),
  ABBR    = tagi('abbr'    , attrs , V'inline'^0),
  ACRONYM = tagi('acronym' , attrs , V'inline'^0),

  special = V'A'    + V'IMG' + V'BR'  + V'SCRIPT' + V'BDO'
          + V'Q'    + V'SUB' + V'SUP' + V'SPAN'   + V'OBJECT'
          + V'FONT' + V'MAP',
  A       = tagi('a'      , A_attr      , (V'inline' - V'A')^0),
  IMG     = tagi('img'    , IMG_attr    , EMPTY,true),
  BR      = tagi('br'     , coreattrs   , EMPTY,true),
  SCRIPT  = tagi('script' , SCRIPT_attr , Cs((CHAR - (P"</" * Hc'script' * P">"))^0)), -- Script
  Q       = tagi('q'      , (attrs + ATTR.cite),V'inline'^0),
  SUB     = tagi('sub'    , attrs       , V'inline'^0),
  SUP     = tagi('sup'    , attrs       , V'inline'^0),
  SPAN    = tagi('span'   , (attrs + reserved) , V'inline'^0),
  BDO     = tagi('bdo'    , (coreattrs + ATTR.lang + ATTR.dir),V'inline'^0),
  MAP     = tagi('map'    , (attrs + ATTR.name) , (V'block' + V'AREA' + WS)^1),
  AREA    = tagi('area'   , AREA_attr   , EMPTY , true),
  OBJECT  = tagi('object' , OBJECT_attr , (V'PARAM' + V'EMBED' + V'flow')^0),
  PARAM   = tagi('param'  , PARAM_attr  , EMPTY , true),
  EMBED   = tagi('embed'  , EMBED_attr  , V'inline'^0), -- XXX non-standard
  FONT    = tagi('font'   , FONT_attr   , V'inline'^0), -- XXX non-standard

  -- Form controls.  This rule is referenced by 'inline' and 'BUTTON'; it
  -- collects the five control elements defined right below.
  formctrl = V'INPUT' + V'SELECT' + V'TEXTAREA' + V'LABEL' + V'BUTTON',
  INPUT    = tagi('input'    , INPUT_attr    , EMPTY,true),
  SELECT   = tagi('select'   , SELECT_attr   , (V'OPTGROUP' + V'OPTION' + WS)^1),
  TEXTAREA = tagi('textarea' , TEXTAREA_attr , PCDATA^0),
  LABEL    = tagi('label'    , LABEL_attr    , (V'inline' - V'LABEL')^0),
  BUTTON   = tagi('button'   , BUTTON_attr   , (V'flow' - (V'A' + V'formctrl' + V'FORM' + V'FIELDSET'))^0),
  OPTGROUP = tagi('optgroup' , (attrs + ATTR.disabled + ATTR.label),(V'OPTION' + WS)^1),
  OPTION   = tagi('option'   , (attrs + ATTR.selected + ATTR.disabled + ATTR.label + ATTR.value),PCDATA^0,true),

  iINS = tagi('ins' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
  iDEL = tagi('del' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),

  -- ----------------------------------------------------------------------
  -- block content
  -- ----------------------------------------------------------------------

  block = V'P'    + V'PRE'  + V'DL'    + V'NOSCRIPT' + V'BLOCKQUOTE'
        + V'FORM' + V'HR'   + V'TABLE' + V'FIELDSET' + V'ADDRESS'
        + V'H1'   + V'H2'   + V'H3'    + V'H4'       + V'H5'
        + V'H6'   + V'UL'   + V'OL'    + V'DIV'
        + V'bINS' + V'bDEL' + Cblock ,

  -- Inline elements that may not appear inside PRE.
  pre_exclude = V'IMG' + V'OBJECT' + V'BIG' + V'SMALL' + V'SUB' + V'SUP',

  P          = tagb('p'          , attrs , V'inline'^0,true),
  PRE        = tagb('pre'        , attrs , (V'inline' - V'pre_exclude')^0),
  BLOCKQUOTE = tagb('blockquote' , (attrs + ATTR.cite),(V'block' + V'SCRIPT')^1),
  HR         = tagb('hr'         , attrs , EMPTY,true),
  ADDRESS    = tagb('address'    , attrs , V'inline'^0),
  H1         = tagb('h1'         , attrs , V'inline'^0),
  H2         = tagb('h2'         , attrs , V'inline'^0),
  H3         = tagb('h3'         , attrs , V'inline'^0),
  H4         = tagb('h4'         , attrs , V'inline'^0),
  H5         = tagb('h5'         , attrs , V'inline'^0),
  H6         = tagb('h6'         , attrs , V'inline'^0),
  DIV        = tagb('div'        , (attrs + reserved),V'flow'^0),
  DL         = tagb('dl'         , attrs , (V'DT' + V'DD')^1),
  DT         = tagb('dt'         , attrs , V'inline'^0,true),
  DD         = tagb('dd'         , attrs , V'flow'^0,true),
  UL         = tagb('ul'         , attrs , (V'LI' + Cblock)^1),
  OL         = tagb('ol'         , attrs + ATTR.start + ATTR.type, (V'LI' + Cblock)^1), -- XXX non-standard
  LI         = tagb('li'         , attrs , V'flow'^0,true),
  NOSCRIPT   = tagb('noscript'   , attrs , V'block'^1),
  FORM       = tagb('form'       , FORM_attr , ((V'block' + V'SCRIPT') - V'FORM')^0),
  FIELDSET   = tagb('fieldset'   , attrs , (V'LEGEND' + V'flow' + PCDATA)^0),
  LEGEND     = tagb('legend'     , (attrs + ATTR.accesskey),V'inline'^0),
  TABLE      = tagb('table'      , TABLE_attr,V'CAPTION'^-1 * (V'COL' + V'COLGROUP')^0 * V'THEAD'^-1 * V'TFOOT'^-1 * V'TBODY'^1),
  CAPTION    = tagb('caption'    , attrs,V'inline'^0),
  COL        = tagb('col'        , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),EMPTY,true),
  COLGROUP   = tagb('colgroup'   , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),V'COL'^0,true),
  THEAD      = tagb('thead'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
  TFOOT      = tagb('tfoot'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
  TBODY      = tagb('tbody'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true,true),
  TR         = tagb('tr'         , (attrs + cellhalign + cellvalign) , (V'TH' + V'TD')^1,true),
  TH         = tagb('th'         , THD_attr , V'flow'^0,true),
  TD         = tagb('td'         , THD_attr , V'flow'^0,true),
  bINS       = tagb('ins'        , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
  bDEL       = tagb('del'        , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
}

-- *************************************************************************

-- The module value: a pattern that matches one or more pieces of content,
-- producing a table of the parsed items followed by the position just past
-- the last item matched.  CHAR and the PRE handling read Carg(1), so
-- match() must be given a table as its first extra argument.
return Ct(parse_tags^1) * lpeg.Cp()

