thalassa/lib/stfilter/stfhtml.hpp

180 lines
6.9 KiB
C++
Raw Normal View History

2026-03-19 01:23:52 +00:00
// +-------------------------------------------------------------------------+
// | StreamFilters library vers. 0.2.03 |
// | Copyright (c) Andrey V. Stolyarov <croco at croco dot net> 2022-2025 |
// | ----------------------------------------------------------------------- |
// | This is free software. Permission is granted to everyone to use, copy |
// | or modify this software under the terms and conditions of |
// | GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1 |
// | as published by Free Software Foundation (see the file LGPL.txt) |
// | |
// | Please visit http://www.croco.net/software/stfilter to get a fresh copy |
// | ----------------------------------------------------------------------- |
// | This code is provided strictly and exclusively on the "AS IS" basis. |
// | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
// +-------------------------------------------------------------------------+
#ifndef STFHTML_HPP_SENTRY
#define STFHTML_HPP_SENTRY
#include "stfilter.hpp"
#include "stfencod.hpp"
//! Filter off HTML tags that aren't explicitly allowed
class StreamFilterHtmlTags : public StreamFilter {
    const char * const *names;
    const char * const *attrs;
    const char * const *pre_tags;

    //! Parser states; the current one is kept in astate
    enum {
        start,
        just_started_tag,
        unnamed_tag,
        tag_name,
        allowed_tag,
        altag_attrname,
        altag_expect_eq,
        altag_expect_val,
        altag_badattr,
        altag_goodattr,
        altag_slashclose,
        prohibited_tag,
        comment_in_excl,    /* seen ``<!'' */
        comment_in_dash,    /* seen ``<!-'' */
        comment,            /* inside the comment */
        comment_out_dash,   /* seen ``-'' */
        comment_out_dash2   /* seen ``--'' */
    };

    //! Quoting modes; the current one is kept in qstate
    enum { unquoted, single_quoted, double_quoted };

    /* qstate may differ from unquoted only inside a tag whose name is
       already known, that is, in the allowed_tag, altag_* and
       prohibited_tag states; as long as qstate != unquoted, astate
       is simply ignored until the matching quote is found and
       qstate becomes unquoted again
     */
    char astate;    // takes the values start, just_started_tag, ...
    char qstate;    // takes the values unquoted, single_quoted, ...
    bool negtag;

    StreamFilterBuffer namebuf;
    StreamFilterBuffer attrbuf;
    int pre_level;

public:
    //! The constructor
    /*! \param allowed_names  array of the allowed tag names
        \param allowed_attrs  array of the allowed attribute names
        \param next  pointer to the next StreamFilter object

        Both allowed_names and allowed_attrs must be NULL-terminated
        arrays of string pointers.  Neither the arrays nor the strings
        are copied or owned by the object, so they must exist for as
        long as the object is in use, and they must not change once the
        filtering has started.  Changing them between the construction
        of the object and the first call to FeedChar is fine.  The
        reason is that the object calculates the length of the longest
        name/attr and only stores that many chars of the name/attr
        being analysed; a tag with a longer name is assumed prohibited.

        NULL means all tags/attrs are allowed.  This makes sense
        together with AddControlledNLReplacer: nothing gets filtered
        off, but the object still tells the NLReplacer where not to
        replace NLs.  To prohibit all tags, pass a pointer to a NULL
        pointer (that is, an array of zero length, as opposed to NULL).

        Within allowed_attrs, a string containing no ``='' names an
        attr allowed for any allowed tag, while a string of the form
        ``name=attr'' allows the attr for that particular tag only,
        e.g., "img=src", "a=href"...
     */
    StreamFilterHtmlTags(const char * const * allowed_names,
                         const char * const * allowed_attrs,
                         StreamFilter *next);
    ~StreamFilterHtmlTags();

    void FeedChar(int c);
    void FeedEnd();
    void Reset();

    //! Attach an NL replacer controlled by this object
    /*! The constructor's description says the object tells the
        NLReplacer where NLs must not be replaced.
        NOTE(review): pre_tags is presumably a NULL-terminated array
        of tag names and texstyle presumably selects the TeX-style
        mode of StreamFilterHtmlReplaceNL -- confirm against the
        implementation.
     */
    void AddControlledNLReplacer(const char * const *pre_tags,
                                 bool texstyle);

private:
    // Judging by the names, there is one handler per astate value.
    void HandleStart(int c);
    void HandleJustStartedTag(int c);
    void HandleUnnamedTag(int c);
    void HandleTagName(int c);
    void HandleAllowedTag(int c);
    void HandleAltagAttrname(int c);
    void HandleAltagExpectEq(int c);
    void HandleAltagExpectVal(int c);
    void HandleAltagBadAttr(int c);
    void HandleAltagGoodAttr(int c);
    void HandleAltagSlashClose(int c);
    void HandleProhibitedTag(int c);
    void HandleCommentInExcl(int c);
    void HandleCommentInDash(int c);
    void HandleComment(int c);
    void HandleCommentOutDash(int c);
    void HandleCommentOutDash2(int c);

    // Helpers used by the implementation file.
    void ProcessCharInsideSkippedTag(int c);
    bool CommitAttrName();
    void CloseAllowedTag();
    void NLModeBlock(bool b);
    void UpdateNLMode();
};
//! Turn NLs into paragraph breaks.
/*! This filter can work in two modes, the default and the TeX-style.
    By default, it replaces a single NL with ``<br />'', and an empty
    line (that is, two NLs separated with nothing but spaces) with
    a new paragraph; actually, ``<p>'' is being placed in front of
    the first non-space char, the ``</p>\n'' is placed in the end of
    the stream, and for NLs, if there's the second one, ``</p>\n<p>''
    is transmitted, otherwise ``<br />'' is out.  All subsequent
    NLs are ignored, as well as all whitespace between them.
    In TeX-style, only empty lines are taken into account, which
    effectively means no ``<br />''s are output, and for <p>/</p>
    the behaviour is the same.
    \note In case a '\r' (CR) char is encountered at least once,
    all the CRs are ignored, but a CR is transmitted in front of
    each NL.
 */
class StreamFilterHtmlReplaceNL : public StreamFilter {
    bool texstyle;      // the mode flag given to the constructor
    enum { start, plain, after_single_nl, disabled };
    int state;          // one of start, plain, after_single_nl, disabled
    int lead_spaces;
    bool blocked;
    bool out_cr;        // presumably "a CR has been seen" -- TODO confirm
public:
    //! The constructor
    /*! \param texs  true selects the TeX-style mode, false selects
               the default one
        \param next  passed on to the StreamFilter constructor

        Every data member gets a definite value here; in particular,
        lead_spaces starts from zero instead of being left
        indeterminate.
     */
    StreamFilterHtmlReplaceNL(bool texs, StreamFilter *next)
        : StreamFilter(next), texstyle(texs), state(start),
        lead_spaces(0), blocked(false), out_cr(false) {}
    ~StreamFilterHtmlReplaceNL() {}
    void FeedChar(int c);
    void FeedEnd();
    void Reset();
    // NOTE(review): the four methods below are presumably the control
    // interface used by StreamFilterHtmlTags after a call to its
    // AddControlledNLReplacer -- confirm against the implementation.
    void TagStart();
    void TagEnd();
    void Enable();
    void Disable();
private:
    void XPutChar(int c);
    void PutNL();  //!< output [<CR>]<LF>
    void CommitSingleNL();
};
//! Replace <, > and & with HTML entities
class StreamFilterHtmlProtect : public StreamFilter {
public:
    //! \param next  passed on to the StreamFilter constructor
    StreamFilterHtmlProtect(StreamFilter *next) : StreamFilter(next) {}

    ~StreamFilterHtmlProtect() {}

    void FeedChar(int c);
};
//! Decode utf8 replacing unknown chars with HTML entities
class StreamFilterUtf8ToHtml : public StreamFilterUtf8ToExtAscii {
public:
    //! Both arguments go to the StreamFilterUtf8ToExtAscii constructor
    StreamFilterUtf8ToHtml(const int * const *table, StreamFilter *next)
        : StreamFilterUtf8ToExtAscii(table, next)
    {}

    // NOTE(review): presumably invoked by the base class for a code
    // it cannot translate -- confirm in stfencod.hpp
    void UnknownCode(int code);
};
#endif