thalassa/lib/stfilter/stfhtml.cpp

// +-------------------------------------------------------------------------+
// |                   StreamFilters library vers. 0.2.03                    |
// |  Copyright (c) Andrey V. Stolyarov <croco at croco dot net> 2022-2025   |
// | ----------------------------------------------------------------------- |
// | This is free software.  Permission is granted to everyone to use, copy  |
// |        or modify this software under the terms and conditions of        |
// |                 GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1               |
// |     as published by Free Software Foundation (see the file LGPL.txt)    |
// |                                                                         |
// | Please visit http://www.croco.net/software/stfilter to get a fresh copy |
// | ----------------------------------------------------------------------- |
// |   This code is provided strictly and exclusively on the "AS IS" basis.  |
// | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
// +-------------------------------------------------------------------------+


#include <string.h>    // for strlen, str[i]cmp

#include "stfhtml.hpp"


// Constructs the tag filter.
//   anames -- zero-terminated array of permitted tag names; a null pointer
//             means every tag is let through (see HandleTagName)
//   aattrs -- zero-terminated array of permitted attributes, each entry
//             being matched either against the bare attribute name or
//             against "tag=attr"; a null pointer means no restriction
//   next   -- handed over to the StreamFilter base class
// The parser starts in the `start' state, outside of any quotes, and with
// no newline replacer attached (pre_tags is null).
StreamFilterHtmlTags::
StreamFilterHtmlTags(const char * const * anames,
                     const char * const * aattrs,
                     StreamFilter *next)
    : StreamFilter(next), names(anames), attrs(aattrs), pre_tags(0),
    astate(start), qstate(unquoted)
#if 0
    namebuf(0), namebufsize(0), attrbuf(0), attrbufsize(0)
#endif
{
    // Give these two a definite value right from the start; otherwise
    // they stay indeterminate until Reset() / HandleStart() happen to
    // assign them.  Assigned in the body (not in the initializer list)
    // so that the member declaration order doesn't matter here.
    pre_level = 0;
    negtag = false;
}

// Destructor; nothing is released here explicitly -- the only cleanup
// code is compiled out by the "#if 0" below.
StreamFilterHtmlTags::~StreamFilterHtmlTags()
{
#if 0
    if(namebuf)
        delete[] namebuf;
#endif
}

// please note we don't use isalpha(3) function family
// as well as str[n]casecmp(3)
// so that we don't suck in the damn locale infrastructure

// Locale-independent whitespace test: returns 1 for space, tab, CR, LF,
// vertical tab and form feed, and 0 for anything else.
static int isspc(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\v':
    case '\f':
        return 1;
    default:
        return 0;
    }
}

// Tells whether c may appear in a tag name: ASCII letters, decimal
// digits, and the three punctuation characters '_', ':' and '.'.
// Returns 1 if so, 0 otherwise.
static int is_tagnamechar(int c)
{
    if(c >= 'a' && c <= 'z')
        return 1;
    if(c >= 'A' && c <= 'Z')
        return 1;
    if(c >= '0' && c <= '9')
        return 1;
    return c == '_' || c == ':' || c == '.';
}

// Converts an ASCII capital letter to the corresponding small one;
// every other value is returned unchanged.
static int lowcase(int c)
{
    if(c < 'A' || c > 'Z')
        return c;
    return c + ('a' - 'A');
}

// Entry point for every input character.
// Outside of quotes the character is dispatched to the handler of the
// current automaton state.  Inside a quoted string no handler is
// involved: the character is copied to the output only if it belongs to
// the value of an accepted attribute, and the matching quote character
// ends the quoted mode.
void StreamFilterHtmlTags::FeedChar(int c)
{
    if(qstate == unquoted) {
        switch(astate) {
        case start:              HandleStart(c);            break;
        case just_started_tag:   HandleJustStartedTag(c);   break;
        case unnamed_tag:        HandleUnnamedTag(c);       break;
        case tag_name:           HandleTagName(c);          break;
        case allowed_tag:        HandleAllowedTag(c);       break;
        case altag_attrname:     HandleAltagAttrname(c);    break;
        case altag_expect_eq:    HandleAltagExpectEq(c);    break;
        case altag_expect_val:   HandleAltagExpectVal(c);   break;
        case altag_badattr:      HandleAltagBadAttr(c);     break;
        case altag_goodattr:     HandleAltagGoodAttr(c);    break;
        case altag_slashclose:   HandleAltagSlashClose(c);  break;
        case prohibited_tag:     HandleProhibitedTag(c);    break;
        case comment_in_excl:    HandleCommentInExcl(c);    break;
        case comment_in_dash:    HandleCommentInDash(c);    break;
        case comment:            HandleComment(c);          break;
        case comment_out_dash:   HandleCommentOutDash(c);   break;
        case comment_out_dash2:  HandleCommentOutDash2(c);  break;
        }
        return;
    }

    // we're within a quoted string
    if(astate == altag_goodattr)
        PutChar(c);     // the closing quote itself gets copied, too

    bool closing_quote =
        (qstate == single_quoted) ? (c == '\'')
                                  : (qstate == double_quoted && c == '\"');
    if(closing_quote)
        qstate = unquoted;
}

void StreamFilterHtmlTags::FeedEnd()
{
    if(astate == allowed_tag || astate == altag_expect_eq ||
        astate == altag_expect_val || astate == altag_badattr ||
        astate == altag_goodattr || astate == altag_slashclose)
    {
        if(astate == altag_goodattr) {
            switch(qstate) {
            case single_quoted:
                PutChar('\'');
                break;
            case double_quoted:
                PutChar('\"');
                break;
            case unquoted:
                ;
            }
        }
        if(astate == altag_slashclose)
            PutChar('/');
        PutChar('>');
    }
    PutEnd();
}

// Brings the parser back to its initial condition: outside of any tag,
// outside of quotes, with the nesting counter used by UpdateNLMode
// zeroed.
void StreamFilterHtmlTags::Reset()
{
    pre_level = 0;
    qstate = unquoted;
    astate = start;
}

// Creates a StreamFilterHtmlReplaceNL object and inserts it right after
// this filter, to be controlled by this filter from now on.
//   pt       -- zero-terminated array of tag names; see UpdateNLMode for
//               how a match switches the replacer off and on
//   texstyle -- passed to the replacer's constructor as is
void StreamFilterHtmlTags::
AddControlledNLReplacer(const char * const *pt, bool texstyle)
{
    pre_tags = pt;
    InsertAfter(new StreamFilterHtmlReplaceNL(texstyle, 0));
    pre_level = 0;
}

// State `start': plain text between tags.
// Everything but '<' is copied to the output; '<' blocks the newline
// replacer (if there's one) and starts collecting a tag.
void StreamFilterHtmlTags::HandleStart(int c)
{
    if(c != '<') {
        PutChar(c);
        return;
    }
    NLModeBlock(true);
    astate = just_started_tag;
    negtag = false;
    namebuf.Reset();
}

void StreamFilterHtmlTags::HandleJustStartedTag(int c)
{
    if(c == '!') {
        astate = comment_in_excl;
    } else
    if(c == '/' || is_tagnamechar(c)) {
        astate = unnamed_tag;
        HandleUnnamedTag(c);
    } else {
        PutStr("&lt;");
        PutChar(c);
        astate = start;
    }
}

// Compares two zero-terminated strings ignoring the case of ASCII
// letters; returns true iff they are equal.
static bool tagname_eq(const char *a, const char *b)
{
    for(; *a && *b; a++, b++)
        if(lowcase(*a) != lowcase(*b))
            return false;
    // equal only if both strings ended simultaneously
    return *a == 0 && *b == 0;
}

// Looks for the given name in a zero-terminated array of strings, using
// tagname_eq for the comparison; returns true iff found.
static bool tagname_is_there(const char *name, const char * const *array)
{
    const char * const *p = array;
    while(*p) {
        if(tagname_eq(*p, name))
            return true;
        p++;
    }
    return false;
}

void StreamFilterHtmlTags::HandleUnnamedTag(int c)
{
    switch(c) {
    case '/':
        if(negtag)  // more than one '/' before the name is too much
            astate = prohibited_tag;
        else
            negtag = true;
        break;
    case '>':
        astate = start;  // just ignore the malformed tag
        break;
    default:
        if(isspc(c))
            break;
        if(is_tagnamechar(c)) {
            astate = tag_name;
            namebuf.Reset();
            namebuf.AddChar(c);
        } else {
            astate = prohibited_tag;
            ProcessCharInsideSkippedTag(c);
        }
    }
}

// State `tag_name': the tag name is being collected in namebuf.
// A name character extends the name.  Whitespace, '>' or '/' ends it,
// and at this point the name is checked against the list of permitted
// tags (no list means every tag is permitted); a permitted tag gets its
// "<name" or "</name" part output right away.  Any other character
// makes the tag prohibited.
void StreamFilterHtmlTags::HandleTagName(int c)
{
    bool name_ends = isspc(c) || c == '>' || c == '/';

    if(!name_ends) {
        if(is_tagnamechar(c)) {
            namebuf.AddChar(c);
        } else {
            astate = prohibited_tag;
            ProcessCharInsideSkippedTag(c);
        }
        return;
    }

    bool permitted = !names || tagname_is_there(namebuf.Get(), names);

    if(!permitted) {
        if(c == '>') {
            // the prohibited tag is already over
            NLModeBlock(false);
            astate = start;
        } else {
            astate = prohibited_tag;
        }
        return;
    }

    astate = allowed_tag;
    if(!negtag)     // for an opening tag, do this before the tag is output
        UpdateNLMode();
    PutChar('<');
    if(negtag)
        PutChar('/');
    PutStr(namebuf.Get());
    switch(c) {
    case '>':
        CloseAllowedTag();
        break;
    case '/':
        astate = altag_slashclose;
        break;
    }
}


// State `allowed_tag': inside a permitted tag, between attributes.
// Space, tab, LF and CR are skipped; '>' and '/' lead to closing the
// tag; '<' or a quote here starts a piece of garbage to be skipped
// (altag_badattr); '=' with no name before it starts a value which
// has no attribute name; any other char starts an attribute name.
void StreamFilterHtmlTags::HandleAllowedTag(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return;
    case '>':
        CloseAllowedTag();
        return;
    case '/':
        astate = altag_slashclose;
        return;
    case '<':
        astate = altag_badattr;
        return;
    case '\'':
    case '\"':
        qstate = (c == '\'') ? single_quoted : double_quoted;
        astate = altag_badattr;
        return;
    case '=':
        attrbuf.Reset();    // no name, so the ``value'' will get ignored
        astate = altag_expect_val;
        return;
    }

    // an ordinary char: this is where an attribute name begins
    attrbuf.Reset();
    attrbuf.AddChar(c);
    astate = altag_attrname;
}

// Decides whether the attribute `attr' of the tag `tag' may be let
// through.
//   tag, attr -- zero-terminated names
//   array     -- zero-terminated list of permitted items; each item is
//                compared (case-insensitively, see tagname_eq) both with
//                the bare attribute name and with the qualified form
//                "tag=attr"; a null pointer means there's no restriction
// An empty attribute name is never allowed, no matter what the list
// says: this is what we get for things like <tag =value>, where the
// ``value'' is supposed to be ignored, and that must hold for the
// unrestricted (null list) case as well.
static bool attrname_allowed(const char *tag, const char *attr,
                             const char * const *array)
{
    if(!attr || !*attr)
        return false;
    if(!array)
        return true;

    // build the qualified form "tag=attr"
    size_t taglen = strlen(tag);
    size_t attrlen = strlen(attr);
    char *qual = new char[taglen + 1 + attrlen + 1];
    memcpy(qual, tag, taglen);
    qual[taglen] = '=';
    memcpy(qual + taglen + 1, attr, attrlen + 1);  // incl. the final 0

    bool res = false;
    for(int i = 0; array[i]; i++) {
        if(tagname_eq(array[i], attr) || tagname_eq(array[i], qual)) {
            res = true;
            break;
        }
    }

    delete[] qual;
    return res;
}

// State `altag_attrname': an attribute name is being collected in
// attrbuf.  Space, tab, LF or CR ends the name and makes us wait for
// '='; '=' leads directly to waiting for the value; '<' or a quote in
// the middle of a name spoils the attribute; '/' and '>' end the tag,
// in which case the name is committed as a valueless attribute first.
void StreamFilterHtmlTags::HandleAltagAttrname(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        astate = altag_expect_eq;
        return;
    case '=':
        astate = altag_expect_val;
        return;
    case '<':
        astate = altag_badattr;
        return;
    case '\'':
    case '\"':
        qstate = (c == '\'') ? single_quoted : double_quoted;
        astate = altag_badattr;
        return;
    case '/':
    case '>':
        CommitAttrName();
        if(c == '/')
            astate = altag_slashclose;
        else
            CloseAllowedTag();
        return;
    }

    // just one more char of the name
    attrbuf.AddChar(c);
}

// State `altag_expect_eq': an attribute name is complete and followed
// by whitespace, so either '=' or something else comes next.
// More whitespace is skipped and '=' makes us wait for the value.
// Anything else means the attribute has no value, so its name is
// committed, and then the char is handled much like in `allowed_tag'.
void StreamFilterHtmlTags::HandleAltagExpectEq(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return;
    case '=':
        astate = altag_expect_val;
        return;
    }

    // in all the remaining cases the collected name is a valueless attr.
    CommitAttrName();

    switch(c) {
    case '<':
        astate = altag_badattr;
        break;
    case '\'':
        qstate = single_quoted;
        astate = altag_badattr;
        break;
    case '\"':
        qstate = double_quoted;
        astate = altag_badattr;
        break;
    case '/':
        astate = altag_slashclose;
        break;
    case '>':
        CloseAllowedTag();
        break;
    default:        // the next attribute name starts right here
        attrbuf.Reset();
        attrbuf.AddChar(c);
        astate = altag_attrname;
    }
}

// State `altag_expect_val': '=' has been seen, the value is expected.
// Space, tab, LF and CR are skipped; '=' or '<' spoils the attribute;
// '/' and '>' end the tag with the (malformed) attribute dropped.  Any
// other char -- a quote or the first char of an unquoted value -- is
// where the value starts: the attribute name is committed now, and if
// it is accepted, '=' and the char itself go to the output.
void StreamFilterHtmlTags::HandleAltagExpectVal(int c)
{
    if(c == ' ' || c == '\t' || c == '\n' || c == '\r')
        return;
    if(c == '=' || c == '<') {
        astate = altag_badattr;
        return;
    }
    if(c == '/') {      // no commit! malformed attr
        astate = altag_slashclose;
        return;
    }
    if(c == '>') {      // no commit! malformed attr
        CloseAllowedTag();
        return;
    }

    // the value starts here; enter the quoted mode if appropriate
    if(c == '\'')
        qstate = single_quoted;
    else
    if(c == '\"')
        qstate = double_quoted;

    if(!CommitAttrName()) {
        astate = altag_badattr;
        return;
    }
    PutChar('=');
    PutChar(c);
    astate = altag_goodattr;
}

// State `altag_badattr': skipping an attribute which is not to be
// output.  Nothing is emitted here.  Space, tab, LF or CR ends the
// skipped attribute; a quote turns the quoted mode on (the quoted text
// is then swallowed by FeedChar); '/' and '>' lead to closing the tag;
// everything else, including '=' and '<', is just ignored.
void StreamFilterHtmlTags::HandleAltagBadAttr(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        astate = allowed_tag;
        break;
    case '\'':
        qstate = single_quoted;
        break;
    case '\"':
        qstate = double_quoted;
        break;
    case '/':
        astate = altag_slashclose;
        break;
    case '>':
        CloseAllowedTag();
        break;
    }
}

// State `altag_goodattr': the value of an accepted attribute is being
// copied to the output (this handler sees its unquoted parts only).
// Space, tab, LF or CR ends the value; '=' or '<' turns the rest into
// garbage to be skipped; '/' and '>' lead to closing the tag; a quote
// is copied and turns the quoted mode on; any other char is copied.
void StreamFilterHtmlTags::HandleAltagGoodAttr(int c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        astate = allowed_tag;
        return;
    case '=':
    case '<':
        astate = altag_badattr;
        return;
    case '/':
        astate = altag_slashclose;
        return;
    case '>':
        CloseAllowedTag();
        return;
    case '\'':
    case '\"':
        PutChar(c);
        qstate = (c == '\'') ? single_quoted : double_quoted;
        return;
    }

    PutChar(c);
}

// State `altag_slashclose': '/' has been seen inside a permitted tag.
// Everything is ignored until '>' comes, and then the tag is finished
// in the self-closing form, " />".
void StreamFilterHtmlTags::HandleAltagSlashClose(int c)
{
    if(c != '>')
        return;
    PutChar(' ');
    PutChar('/');
    CloseAllowedTag();
}

// State `prohibited_tag': the tag is being skipped without any output.
// A newline outside of quotes aborts the skipping: the newline goes to
// the output and we return to plain text.  All the other chars are
// handled by ProcessCharInsideSkippedTag.
// NOTE(review): the newline path doesn't call NLModeBlock(false), while
// the '>' path (in ProcessCharInsideSkippedTag) does -- is the newline
// replacer intentionally left blocked here?  to be confirmed
void StreamFilterHtmlTags::HandleProhibitedTag(int c)
{
    if(c != '\n' || qstate != unquoted) {
        ProcessCharInsideSkippedTag(c);
        return;
    }
    PutChar('\n');
    astate = start;
}

void StreamFilterHtmlTags::HandleCommentInExcl(int c)
{
    if(c == '-') {
        astate = comment_in_dash;
    } else {
        // this is incorrect comment start! let's handle this... somehow
        PutStr("&lt;!");
        PutChar(c);
        astate = start;
    }
}

void StreamFilterHtmlTags::HandleCommentInDash(int c)
{
    if(c == '-') {
        astate = comment;
    } else {
        // this is incorrect comment start! let's handle this... somehow
        PutStr("&lt;!-");
        PutChar(c);
    }
}

// State `comment': inside a comment; nothing is output, we only watch
// for a dash which may begin the closing sequence.
void StreamFilterHtmlTags::HandleComment(int c)
{
    if(c != '-')
        return;
    astate = comment_out_dash;
}

// State `comment_out_dash': one dash seen inside a comment.  The second
// dash brings us closer to the end of the comment, any other char
// returns us to the ordinary comment state.
void StreamFilterHtmlTags::HandleCommentOutDash(int c)
{
    astate = (c == '-') ? comment_out_dash2 : comment;
}

// State `comment_out_dash2': at least two dashes in a row seen inside a
// comment.  '>' ends the comment (and releases the newline replacer),
// one more dash changes nothing, any other char means we're still
// inside the comment.
void StreamFilterHtmlTags::HandleCommentOutDash2(int c)
{
    if(c == '-') {
        // well, let's just remain waiting for ``-->''
        // actually, this may be smth. like ``--------->''
        return;
    }
    if(c == '>') {
        astate = start;
        NLModeBlock(false);
        return;
    }
    // failed to quit the comment :-)
    astate = comment;
}


// Handles a char belonging to a tag which is being skipped: quotes turn
// the quoted mode on, '>' ends the tag (returning to plain text and
// releasing the newline replacer), everything else is just dropped.
void StreamFilterHtmlTags::ProcessCharInsideSkippedTag(int c)
{
    switch(c) {
    case '\'':
        qstate = single_quoted;
        break;
    case '\"':
        qstate = double_quoted;
        break;
    case '>':
        astate = start;
        NLModeBlock(false);
        break;
    }
}

// Checks the attribute name collected in attrbuf (for the tag whose
// name is in namebuf) against the list of permitted attributes.  If
// the attribute is permitted, a space and the name are output.
// Returns true iff the attribute is permitted.
bool StreamFilterHtmlTags::CommitAttrName()
{
    if(!attrname_allowed(namebuf.Get(), attrbuf.Get(), attrs))
        return false;
    PutChar(' ');
    PutStr(attrbuf.Get());
    return true;
}

// Finishes a permitted tag: outputs '>', releases the newline replacer
// and returns to plain text.  For a closing tag the replacer's on/off
// mode is updated here, that is, after the tag has been output.
void StreamFilterHtmlTags::CloseAllowedTag()
{
    astate = start;
    PutChar('>');
    NLModeBlock(false);
    if(!negtag)
        return;
    UpdateNLMode();
}

void StreamFilterHtmlTags::NLModeBlock(bool b)
{
    if(!pre_tags)
        return;
    StreamFilterHtmlReplaceNL *replacer =
        static_cast<StreamFilterHtmlReplaceNL*>(GetNext());
    if(b)
        replacer->TagStart();
    else
        replacer->TagEnd();
}

void StreamFilterHtmlTags::UpdateNLMode()
{
    if(!pre_tags)
        return;
    StreamFilterHtmlReplaceNL *replacer =
        static_cast<StreamFilterHtmlReplaceNL*>(GetNext());
    if(tagname_is_there(namebuf.Get(), pre_tags)) {
        if(negtag) {
            if(pre_level > 0)
                pre_level--;
            if(pre_level == 0)
                replacer->Enable();
        } else {
            if(pre_level == 0)
                replacer->Disable();
            pre_level++;
        }
    }
}


////////////////////////////////////////////////////////////////////////


// Processes one char of the text, turning newlines into paragraph
// markup:
//   - the first non-whitespace char after the start (or after an empty
//     line) gets "<p>" output before it;
//   - two newlines in a row close the paragraph with "</p>";
//   - a single newline, unless in the texstyle mode, becomes "<br />"
//     (see CommitSingleNL); in the texstyle mode it is just a newline;
//   - in the `disabled' state chars are passed through XPutChar.
// While `blocked' is set (that is, between TagStart and TagEnd), chars
// are copied as they are, with the only exception explained below.
void StreamFilterHtmlReplaceNL::FeedChar(int c)
{
    if(blocked && state == start && c == '<') {
            // VERY special case, when a paragraph starts with a tag
            // which (in the present version) is not marked as disabling
            // the NL replacing (but for future versions: with a tag which
            // may appear inside a paragraph)
        PutStr("<p>");
        state = plain;
        PutChar('<');
        return;
    }
    if(blocked) {
        // inside a tag: no processing at all
        PutChar(c);
        return;
    }
    if(c == '\r') {
        out_cr = true;   // yes, we will transmit CRs for all NLs, ...
        return;          // but we ignore them on input
    }
    switch(state) {
    case start:
        // no paragraph is open; whitespace doesn't open one
        if(isspc(c)) {
            XPutChar(c);
            break;
        }
        // non-whitespace found!
        PutStr("<p>");
        XPutChar(c);
        state = plain;
        break;
    case plain:
        // inside a paragraph, not right after a newline
        if(c == '\n') {
            state = after_single_nl;
            lead_spaces = 0;
            if(texstyle)
                PutNL();
        } else {
            XPutChar(c);
        }
        break;
    case after_single_nl:
        // exactly one newline seen; what comes next decides its fate
        if(c == '\n') {
            // empty line: the paragraph is over
            PutStr("</p>");
            PutNL();
            state = start;
        } else
        if(isspc(c)) {
            // leading whitespace is counted so that CommitSingleNL can
            // output as many spaces after the line break
            lead_spaces++;
            if(texstyle)
                XPutChar(c);
        } else {
            CommitSingleNL();
            XPutChar(c);
            state = plain;
        }
        break;
    case disabled:
        XPutChar(c);
    };
}


// Handles the end of input: a paragraph which is still open gets its
// "</p>" (followed by a newline), then PutEnd() is called.
void StreamFilterHtmlReplaceNL::FeedEnd()
{
    // neither `start' nor `disabled' means that
    // at least one char has been output!
    bool paragraph_open = state != start && state != disabled;

    if(paragraph_open && !blocked) {
        PutStr("</p>");
        PutNL();
    }

    /* the only ``unfinished'' situation is after 1 NL, but, well,
       we don't want any trailing spaces anyway
     */

    PutEnd();
}

// Returns the replacer to the `start' state with the blocking off.
void StreamFilterHtmlReplaceNL::Reset()
{
    blocked = false;
    state = start;
}

// Called when a tag starts: from now on (until TagEnd) chars are passed
// through unprocessed.  A single newline seen right before the tag is
// committed first, so that the tag belongs to the current paragraph.
void StreamFilterHtmlReplaceNL::TagStart()
{
    blocked = true;
    if(state != after_single_nl)
        return;
    CommitSingleNL();
    state = plain;
}

// Called when a tag ends: lifts the blocking set by TagStart, so that
// FeedChar resumes processing the text.
void StreamFilterHtmlReplaceNL::TagEnd()
{
    blocked = false;
}

// Turns the replacing on (again) by entering the `start' state, in
// which no paragraph is open yet.
void StreamFilterHtmlReplaceNL::Enable()
{
    state = start;
}

// Turns the replacing off by entering the `disabled' state.  Unless we
// were in the `start' state, "</p>" and a newline are output first.
void StreamFilterHtmlReplaceNL::Disable()
{
    bool was_at_start = (state == start);
    state = disabled;
    if(was_at_start)
        return;
    // at least one char was output!
    PutStr("</p>");
    PutNL();
}

void StreamFilterHtmlReplaceNL::XPutChar(int c)
{
    if(c == '\r')
        return;
    if(out_cr && c == '\n')
        PutChar('\r');
    PutChar(c);
}

// Outputs a line end: just LF, or CR LF in case out_cr is set (FeedChar
// sets it as soon as a CR is seen in the input).
void StreamFilterHtmlReplaceNL::PutNL()
{
    if(out_cr)
        PutChar('\r');
    PutChar('\n');
}

// Outputs what a single newline inside a paragraph is replaced with:
// "<br />", a line end, and as many spaces as whitespace chars were
// counted after the newline (lead_spaces).  In the texstyle mode
// nothing is output here.
void StreamFilterHtmlReplaceNL::CommitSingleNL()
{
    if(texstyle)
        return;
    PutStr("<br />");
    PutNL();
    for(int left = lead_spaces; left > 0; left--)
        PutChar(' ');
}


///////////////////////////////////////////////////////////////////////

// Replaces the chars '<', '>' and '&' with the respective entities
// (&lt;, &gt; and &amp;); all the other chars are passed unchanged.
void StreamFilterHtmlProtect::FeedChar(int c)
{
    const char *entity = 0;

    if(c == '<')
        entity = "&lt;";
    else
    if(c == '>')
        entity = "&gt;";
    else
    if(c == '&')
        entity = "&amp;";

    if(entity)
        PutStr(entity);
    else
        PutChar(c);
}

///////////////////////////////////////////////////////////////////////
// boring tables follow

/* the following tables cover up the whole set of HTML4 (!) entities,
   as listed at https://www.w3schools.com/charsets/ref_html_entities_4.asp

   However, three small sections are excluded intentionally to reduce the
   search time, so &fnof;, &loz;, &spades;, &clubs;, &hearts; and &diams;
   are left aside; if you need them, remove the appropriate "#if 0"s
   below.

   Please note that HTML5 entities are not in the list; there are too
   many of them, and HTML5 itself is just another committee-made bastard
   which has no right to exist.
 */

// Entity names for the code points 0xa0..0xff; the element with the
// index i names the code point 0xa0+i (see html_entity_table).
static const char *html_entities_a0[96] = {
    "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
    "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
    "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
    "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
    "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
    "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
    "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
    "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
    "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
    "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
    "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
    "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
};
#if 0
static const char *html_entities_192[1] = {
    "fnof"
};
#endif
// Entity names for the code points starting at 0x391 (Greek letters);
// the element with the index i names the code point 0x391+i, zero
// meaning there's no named entity for that code point.
static const char *html_entities_391[70] = {
    "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
    "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi",
    "Rho", 0, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi",
    "Omega", 0, 0, 0, 0, 0, 0, 0,
    "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi",
    "rho", "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi",
    "omega", 0, 0, 0, 0, 0, 0, 0,
    "thetasym", "upsih", 0, 0, 0, "piv"
};
// Entity names for the code points starting at 0x2022; the element with
// the index i names the code point 0x2022+i, zero meaning there's no
// named entity for that code point.
static const char *html_entities_2022[35] = {
    "bull", 0, 0, 0, "hellip", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    "prime", "Prime", 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "oline", 0, 0, 0,
    0, 0, "frasl"
};
// Entity names for the code points starting at 0x2111 (letterlike
// symbols, arrows, the beginning of math operators); the element with
// the index i names the code point 0x2111+i, zero meaning there's no
// named entity for that code point.
static const char *html_entities_2111[255] = {
    "image", 0, 0, 0, 0, 0, 0, "weierp",
    0, 0, 0, "real", 0, 0, 0, 0,
    0, "trade", 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "alefsym", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, "larr",
    "uarr", "rarr", "darr", "harr", 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "crarr", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, "lArr",
    "uArr", "rArr", "dArr", "hArr", 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, "forall",
    0, "part", "exist", 0, "empty", 0, "nabla", "isin",
    "notin", 0, "ni", 0, 0, 0, "prod"
};
// Entity names for the code points starting at 0x2211 (math operators
// and some technical symbols); the element with the index i names the
// code point 0x2211+i, zero meaning there's no named entity for that
// code point.
static const char *html_entities_2211[282] = {
    "sum", "minus", 0, 0, 0, 0, "lowast", 0,
    0, "radic", 0, 0, "prop", "infin", 0, "ang",
    0, 0, 0, 0, 0, 0, "and", "or",
    "cap", "cup", "int", 0, 0, 0, 0, 0,
    0, 0, 0, "there4", 0, 0, 0, 0,
    0, 0, 0, "sim", 0, 0, 0, 0,
    0, 0, 0, 0, "cong", 0, 0, "asymp",
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, "ne",
    "equiv", 0, 0, "le", "ge", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, "sub", "sup", "nsub", 0, "sube", "supe", 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "oplus", 0, "otimes", 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "perp", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, "sdot", 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, "lceil",
    "rceil", "lfloor", "rfloor", 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    "lang", "rang"
};
#if 0
static const char *html_entities_25ca[1] = {
    "loz"
};
static const char *html_entities_2660[7] = {
    "spades", 0, 0, "clubs", 0, "hearts", "diams"
};
#endif
// Describes one contiguous range of code points which has a table of
// entity names.
struct html_entity_section {
    int base, len;        // the first code point and the table length
    const char **names;   // names[i] is for base+i; zero if no entity
};
// All the sections in use; the list is terminated by an item with the
// null `names' pointer.  Each `len' here must agree with the size of
// the respective array.
static struct html_entity_section html_entity_table[] = {
    { 0x000a0,  96, html_entities_a0 },
#if 0
    { 0x00192,   1, html_entities_192 },
#endif
    { 0x00391,  70, html_entities_391 },
    { 0x02022,  35, html_entities_2022 },
    { 0x02111, 255, html_entities_2111 },
    { 0x02211, 282, html_entities_2211 },
#if 0
    { 0x025ca,   1, html_entities_25ca },
    { 0x02660,   7, html_entities_2660 },
#endif
    { 0, 0, 0 }
};

// end of boring tables
///////////////////////////////////////////////////////////////////////

static const char *get_html_entity(int code)
{
    struct html_entity_section *p;
    for(p = html_entity_table; p->names; p++)
        if(code >= p->base && code < p->base + p->len)
            return p->names[code - p->base];
    return 0;
}

// Outputs the given code as an HTML entity: the named one ("&name;")
// if get_html_entity knows the name, otherwise a numeric reference
// starting with "&#x", with the digits produced by PutHex.
void StreamFilterUtf8ToHtml::UnknownCode(int code)
{
    const char *name = get_html_entity(code);
    if(!name) {
        PutStr("&#x");
        PutHex(code, 0, true);
        PutChar(';');
        return;
    }
    PutChar('&');
    PutStr(name);
    PutChar(';');
}