// +-------------------------------------------------------------------------+ // | StreamFilters library vers. 0.2.03 | // | Copyright (c) Andrey V. Stolyarov 2022-2025 | // | ----------------------------------------------------------------------- | // | This is free software. Permission is granted to everyone to use, copy | // | or modify this software under the terms and conditions of | // | GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1 | // | as published by Free Software Foundation (see the file LGPL.txt) | // | | // | Please visit http://www.croco.net/software/stfilter to get a fresh copy | // | ----------------------------------------------------------------------- | // | This code is provided strictly and exclusively on the "AS IS" basis. | // | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! | // +-------------------------------------------------------------------------+ #include // for strlen, str[i]cmp #include "stfhtml.hpp" StreamFilterHtmlTags:: StreamFilterHtmlTags(const char * const * anames, const char * const * aattrs, StreamFilter *next) : StreamFilter(next), names(anames), attrs(aattrs), pre_tags(0), astate(start), qstate(unquoted) #if 0 namebuf(0), namebufsize(0), attrbuf(0), attrbufsize(0) #endif { } StreamFilterHtmlTags::~StreamFilterHtmlTags() { #if 0 if(namebuf) delete[] namebuf; #endif } // please note we don't use isalpha(3) function family // as well as str[n]casecmp(3) // so that we don't suck in the damn locale infrastructure static int isspc(int c) { return c==' ' || c=='\t' || c=='\r' || c=='\n' || c =='\v' || c=='\f'; } static int is_tagnamechar(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == ':' || c == '.'; } static int lowcase(int c) { return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c; } void StreamFilterHtmlTags::FeedChar(int c) { if(qstate != unquoted) { if(astate == altag_goodattr) PutChar(c); if((qstate == single_quoted && c == '\'') || (qstate == double_quoted && c == '\"')) { qstate = unquoted; } return; } // once we're here, this means we're in the unquoted state switch(astate) { case start: HandleStart(c); break; case just_started_tag: HandleJustStartedTag(c); break; case unnamed_tag: HandleUnnamedTag(c); break; case tag_name: HandleTagName(c); break; case allowed_tag: HandleAllowedTag(c); break; case altag_attrname: HandleAltagAttrname(c); break; case altag_expect_eq: HandleAltagExpectEq(c); break; case altag_expect_val: HandleAltagExpectVal(c); break; case altag_badattr: HandleAltagBadAttr(c); break; case altag_goodattr: HandleAltagGoodAttr(c); break; case altag_slashclose: HandleAltagSlashClose(c); break; case prohibited_tag: HandleProhibitedTag(c); break; case comment_in_excl: HandleCommentInExcl(c); break; case comment_in_dash: HandleCommentInDash(c); break; case comment: HandleComment(c); break; case comment_out_dash: HandleCommentOutDash(c); break; case comment_out_dash2: HandleCommentOutDash2(c); break; } } void StreamFilterHtmlTags::FeedEnd() { if(astate == allowed_tag || astate == altag_expect_eq || astate == altag_expect_val || astate == altag_badattr || astate == altag_goodattr || astate == altag_slashclose) { if(astate == altag_goodattr) { switch(qstate) { case single_quoted: PutChar('\''); break; case double_quoted: PutChar('\"'); break; case unquoted: ; } } if(astate == altag_slashclose) PutChar('/'); PutChar('>'); } PutEnd(); } void StreamFilterHtmlTags::Reset() { astate = start; qstate = unquoted; pre_level = 0; } void StreamFilterHtmlTags:: AddControlledNLReplacer(const char * const *pt, bool texstyle) { pre_tags = pt; StreamFilterHtmlReplaceNL *nlr = new StreamFilterHtmlReplaceNL(texstyle, 0); InsertAfter(nlr); pre_level = 0; } void StreamFilterHtmlTags::HandleStart(int c) { if(c == '<') { NLModeBlock(true); astate = just_started_tag; negtag = false; namebuf.Reset(); } else { PutChar(c); } } void StreamFilterHtmlTags::HandleJustStartedTag(int c) { if(c == '!') { astate = comment_in_excl; } else if(c == '/' || is_tagnamechar(c)) { astate = unnamed_tag; HandleUnnamedTag(c); } else { PutStr("<"); PutChar(c); astate = start; } } static bool tagname_eq(const char *a, const char *b) { while(*a && *b) { if(lowcase(*a) != lowcase(*b)) return 0; a++; b++; } return !*a && !*b; } static bool tagname_is_there(const char *name, const char * const *array) { int i; for(i = 0; array[i]; i++) if(tagname_eq(array[i], name)) return true; return false; } void StreamFilterHtmlTags::HandleUnnamedTag(int c) { switch(c) { case '/': if(negtag) // more than one '/' before the name is too much astate = prohibited_tag; else negtag = true; break; case '>': astate = start; // just ignore the malformed tag break; default: if(isspc(c)) break; if(is_tagnamechar(c)) { astate = tag_name; namebuf.Reset(); namebuf.AddChar(c); } else { astate = prohibited_tag; ProcessCharInsideSkippedTag(c); } } } void StreamFilterHtmlTags::HandleTagName(int c) { // correct delimiter after the name may be either space, or '>', or "/>" if(isspc(c) || c == '>' || c == '/') { if(!names || tagname_is_there(namebuf.Get(), names)) { astate = allowed_tag; if(!negtag) UpdateNLMode(); PutChar('<'); if(negtag) PutChar('/'); PutStr(namebuf.Get()); if(c == '>') CloseAllowedTag(); else if(c == '/') astate = altag_slashclose; } else { astate = prohibited_tag; if(c == '>') { NLModeBlock(false); astate = start; } } } else if(is_tagnamechar(c)) { namebuf.AddChar(c); } else { astate = prohibited_tag; ProcessCharInsideSkippedTag(c); } } void StreamFilterHtmlTags::HandleAllowedTag(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': break; case '=': attrbuf.Reset(); // no name, so the ``value'' will get ignored astate = altag_expect_val; break; case '<': astate = altag_badattr; break; case '\'': qstate = single_quoted; astate = altag_badattr; break; case '\"': qstate = double_quoted; astate = altag_badattr; break; case '/': astate = altag_slashclose; break; case '>': CloseAllowedTag(); break; default: attrbuf.Reset(); attrbuf.AddChar(c); astate = altag_attrname; } } static bool attrname_allowed(const char *tag, const char *attr, const char * const *array) { if(!array) return true; bool res; char *qual; int qlen = strlen(tag) + 1 + strlen(attr) + 1; qual = new char[qlen]; strcpy(qual, tag); strcat(qual, "="); strcat(qual, attr); int i; for(i = 0; array[i]; i++) { if(tagname_eq(array[i], attr) || tagname_eq(array[i], qual)) { res = true; goto quit; } } res = false; quit: delete[] qual; return res; } void StreamFilterHtmlTags::HandleAltagAttrname(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': astate = altag_expect_eq; break; case '=': astate = altag_expect_val; break; case '<': astate = altag_badattr; break; case '\'': qstate = single_quoted; astate = altag_badattr; break; case '\"': qstate = double_quoted; astate = altag_badattr; break; case '/': CommitAttrName(); astate = altag_slashclose; break; case '>': CommitAttrName(); CloseAllowedTag(); break; default: attrbuf.AddChar(c); } } void StreamFilterHtmlTags::HandleAltagExpectEq(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': break; case '=': astate = altag_expect_val; break; case '<': CommitAttrName(); astate = altag_badattr; break; case '\'': CommitAttrName(); qstate = single_quoted; astate = altag_badattr; break; case '\"': CommitAttrName(); qstate = double_quoted; astate = altag_badattr; break; case '/': CommitAttrName(); astate = altag_slashclose; break; case '>': CommitAttrName(); CloseAllowedTag(); break; default: CommitAttrName(); attrbuf.Reset(); attrbuf.AddChar(c); astate = altag_attrname; } } void StreamFilterHtmlTags::HandleAltagExpectVal(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': return; case '=': case '<': astate = altag_badattr; return; case '\'': qstate = single_quoted; break; case '\"': qstate = double_quoted; break; case '/': /* CommitAttrName(); */ // no commit! malformed attr astate = altag_slashclose; return; case '>': /* CommitAttrName(); */ // no commit! malformed attr CloseAllowedTag(); return; default: break; } bool good = CommitAttrName(); if(good) { PutChar('='); PutChar(c); astate = altag_goodattr; } else { astate = altag_badattr; } } void StreamFilterHtmlTags::HandleAltagBadAttr(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': astate = allowed_tag; break; case '=': case '<': break; case '\'': qstate = single_quoted; break; case '\"': qstate = double_quoted; break; case '/': astate = altag_slashclose; break; case '>': CloseAllowedTag(); break; default: break; } } void StreamFilterHtmlTags::HandleAltagGoodAttr(int c) { switch(c) { case ' ': case '\t': case '\n': case '\r': astate = allowed_tag; break; case '=': case '<': astate = altag_badattr; break; case '\'': PutChar(c); qstate = single_quoted; break; case '\"': PutChar(c); qstate = double_quoted; break; case '/': astate = altag_slashclose; break; case '>': CloseAllowedTag(); break; default: PutChar(c); break; } } void StreamFilterHtmlTags::HandleAltagSlashClose(int c) { if(c == '>') { PutChar(' '); PutChar('/'); CloseAllowedTag(); } } void StreamFilterHtmlTags::HandleProhibitedTag(int c) { if(qstate == unquoted && c == '\n') { PutChar('\n'); astate = start; return; } ProcessCharInsideSkippedTag(c); } void StreamFilterHtmlTags::HandleCommentInExcl(int c) { if(c == '-') { astate = comment_in_dash; } else { // this is incorrect comment start! let's handle this... somehow PutStr("<!"); PutChar(c); astate = start; } } void StreamFilterHtmlTags::HandleCommentInDash(int c) { if(c == '-') { astate = comment; } else { // this is incorrect comment start! let's handle this... somehow PutStr("<!-"); PutChar(c); } } void StreamFilterHtmlTags::HandleComment(int c) { if(c == '-') { astate = comment_out_dash; } } void StreamFilterHtmlTags::HandleCommentOutDash(int c) { if(c == '-') { astate = comment_out_dash2; } else { astate = comment; } } void StreamFilterHtmlTags::HandleCommentOutDash2(int c) { switch(c) { case '>': astate = start; NLModeBlock(false); break; case '-': // well, let's just remain waiting for ``-->'' // actually, this may be smth. like ``--------->'' break; default: // failed to quit the comment :-) astate = comment; } } void StreamFilterHtmlTags::ProcessCharInsideSkippedTag(int c) { if(c == '\'') qstate = single_quoted; else if(c == '\"') qstate = double_quoted; else if(c == '>') { astate = start; NLModeBlock(false); } } bool StreamFilterHtmlTags::CommitAttrName() { bool res = attrname_allowed(namebuf.Get(), attrbuf.Get(), attrs); if(res) { PutChar(' '); PutStr(attrbuf.Get()); } return res; } void StreamFilterHtmlTags::CloseAllowedTag() { PutChar('>'); NLModeBlock(false); if(negtag) UpdateNLMode(); astate = start; } void StreamFilterHtmlTags::NLModeBlock(bool b) { if(!pre_tags) return; StreamFilterHtmlReplaceNL *replacer = static_cast(GetNext()); if(b) replacer->TagStart(); else replacer->TagEnd(); } void StreamFilterHtmlTags::UpdateNLMode() { if(!pre_tags) return; StreamFilterHtmlReplaceNL *replacer = static_cast(GetNext()); if(tagname_is_there(namebuf.Get(), pre_tags)) { if(negtag) { if(pre_level > 0) pre_level--; if(pre_level == 0) replacer->Enable(); } else { if(pre_level == 0) replacer->Disable(); pre_level++; } } } //////////////////////////////////////////////////////////////////////// void StreamFilterHtmlReplaceNL::FeedChar(int c) { if(blocked && state == start && c == '<') { // VERY special case, when a paragraph starts with a tag // which (in the present version) is not marked as disabling // the NL replacing (but for future versions: with a tag which // may appear inside a paragraph) PutStr("

"); state = plain; PutChar('<'); return; } if(blocked) { PutChar(c); return; } if(c == '\r') { out_cr = true; // yes, we will transmit CRs for all NLs, ... return; // but we ignore them on input } switch(state) { case start: if(isspc(c)) { XPutChar(c); break; } // non-whitespace found! PutStr("

"); XPutChar(c); state = plain; break; case plain: if(c == '\n') { state = after_single_nl; lead_spaces = 0; if(texstyle) PutNL(); } else { XPutChar(c); } break; case after_single_nl: if(c == '\n') { PutStr("

"); PutNL(); state = start; } else if(isspc(c)) { lead_spaces++; if(texstyle) XPutChar(c); } else { CommitSingleNL(); XPutChar(c); state = plain; } break; case disabled: XPutChar(c); }; } void StreamFilterHtmlReplaceNL::FeedEnd() { if(state != start && state != disabled && !blocked) { // at least one char has been output! PutStr("

"); PutNL(); } /* the only ``unfinished'' situation is after 1 NL, but, well, we don't want any trailing spaces anyway */ PutEnd(); } void StreamFilterHtmlReplaceNL::Reset() { state = start; blocked = false; } void StreamFilterHtmlReplaceNL::TagStart() { if(state == after_single_nl) { CommitSingleNL(); state = plain; } blocked = true; } void StreamFilterHtmlReplaceNL::TagEnd() { blocked = false; } void StreamFilterHtmlReplaceNL::Enable() { state = start; } void StreamFilterHtmlReplaceNL::Disable() { if(state != start) { // at least one char was output! PutStr("

"); PutNL(); } state = disabled; } void StreamFilterHtmlReplaceNL::XPutChar(int c) { if(c == '\r') return; if(out_cr && c == '\n') PutChar('\r'); PutChar(c); } void StreamFilterHtmlReplaceNL::PutNL() { if(out_cr) PutChar('\r'); PutChar('\n'); } void StreamFilterHtmlReplaceNL::CommitSingleNL() { if(texstyle) return; PutStr("
"); PutNL(); int i; for(i = 0; i < lead_spaces; i++) PutChar(' '); } /////////////////////////////////////////////////////////////////////// void StreamFilterHtmlProtect::FeedChar(int c) { switch(c) { case '>': PutStr(">"); break; case '<': PutStr("<"); break; case '&': PutStr("&"); break; default: PutChar(c); } } /////////////////////////////////////////////////////////////////////// // boring tables follow /* the following tables cover up the whole set of HTTP4 (!) entities, as listed at https://www.w3schools.com/charsets/ref_html_entities_4.asp However, three small sections are excluded intentionally to reduce the search time, so ƒ, ◊, ♠, ♣, ♥ and ♦ are left aside; if you need them, remove the appropriate "#if 0"s below. Please note that HTML5 entities are not in the list; there are too many of them, and HTML5 itself is just another commitee-made bastard which has no right to exist. */ static const char *html_entities_a0[96] = { "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml" }; #if 0 static const char *html_entities_192[1] = { "fnof" }; #endif static const char *html_entities_391[70] = { "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", 0, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", 0, 0, 0, 0, 0, 0, 0, "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", 0, 0, 0, 0, 0, 0, 0, "thetasym", "upsih", 0, 0, 0, "piv" }; static const char *html_entities_2022[35] = { "bull", 0, 0, 0, "hellip", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "prime", "Prime", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "oline", 0, 0, 0, 0, 0, "frasl" }; static const char *html_entities_2111[255] = { "image", 0, 0, 0, 0, 0, 0, "weierp", 0, 0, 0, "real", 0, 0, 0, 0, 0, "trade", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "alefsym", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "larr", "uarr", "rarr", "darr", "harr", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "crarr", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "lArr", "uArr", "rArr", "dArr", "hArr", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "forall", 0, "part", "exist", 0, "empty", 0, "nabla", "isin", "notin", 0, "ni", 0, 0, 0, "prod" }; static const char *html_entities_2211[282] = { "sum", "minus", 0, 0, 0, 0, "lowast", 0, 0, "radic", 0, 0, "prop", "infin", 0, "ang", 0, 0, 0, 0, 0, 0, "and", "or", "cap", "cup", "int", 0, 0, 0, 0, 0, 0, 0, 0, "there4", 0, 0, 0, 0, 0, 0, 0, "sim", 0, 0, 0, 0, 0, 0, 0, 0, "cong", 0, 0, "asymp", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "ne", "equiv", 0, 0, "le", "ge", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "sub", "sup", "nsub", 0, "sube", "supe", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "oplus", 0, "otimes", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "perp", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "sdot", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "lceil", "rceil", "lfloor", "rfloor", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "lang", "rang" }; #if 0 static const char *html_entities_25ca[1] = { "loz" }; static const char *html_entities_2660[7] = { "spades", 0, 0, "clubs", 0, "hearts", "diams" }; #endif struct html_entity_section { int base, len; const char **names; }; static struct html_entity_section html_entity_table[] = { { 0x000a0, 96, html_entities_a0 }, #if 0 { 0x00192, 1, html_entities_192 }, #endif { 0x00391, 70, html_entities_391 }, { 0x02022, 35, html_entities_2022 }, { 0x02111, 255, html_entities_2111 }, { 0x02211, 282, html_entities_2211 }, #if 0 { 0x025ca, 1, html_entities_25ca }, { 0x02660, 7, html_entities_2660 }, #endif { 0, 0, 0 } }; // end of boring tables /////////////////////////////////////////////////////////////////////// static const char *get_html_entity(int code) { struct html_entity_section *p; for(p = html_entity_table; p->names; p++) if(code >= p->base && code < p->base + p->len) return p->names[code - p->base]; return 0; } void StreamFilterUtf8ToHtml::UnknownCode(int code) { const char *e = get_html_entity(code); if(e) { PutChar('&'); PutStr(e); PutChar(';'); } else { PutStr("&#x"); PutHex(code, 0, true); PutChar(';'); } }