383 lines
12 KiB
C++
383 lines
12 KiB
C++
|
|
// +-------------------------------------------------------------------------+
|
||
|
|
// | StreamFilters library vers. 0.2.03 |
|
||
|
|
// | Copyright (c) Andrey V. Stolyarov <croco at croco dot net> 2022-2025 |
|
||
|
|
// | ----------------------------------------------------------------------- |
|
||
|
|
// | This is free software. Permission is granted to everyone to use, copy |
|
||
|
|
// | or modify this software under the terms and conditions of |
|
||
|
|
// | GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1 |
|
||
|
|
// | as published by Free Software Foundation (see the file LGPL.txt) |
|
||
|
|
// | |
|
||
|
|
// | Please visit http://www.croco.net/software/stfilter to get a fresh copy |
|
||
|
|
// | ----------------------------------------------------------------------- |
|
||
|
|
// | This code is provided strictly and exclusively on the "AS IS" basis. |
|
||
|
|
// | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
|
||
|
|
// +-------------------------------------------------------------------------+
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
// +-------------------------------------------------------------------------+
|
||
|
|
// | StreamFilters library vers. 0.2.03 |
|
||
|
|
// | Copyright (c) Andrey V. Stolyarov <croco at croco dot net> 2022-2025 |
|
||
|
|
// | ----------------------------------------------------------------------- |
|
||
|
|
// | This is free software. Permission is granted to everyone to use, copy |
|
||
|
|
// | or modify this software under the terms and conditions of |
|
||
|
|
// | GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1 |
|
||
|
|
// | as published by Free Software Foundation (see the file LGPL.txt) |
|
||
|
|
// | |
|
||
|
|
// | Please visit http://www.croco.net/software/stfilter to get a fresh copy |
|
||
|
|
// | ----------------------------------------------------------------------- |
|
||
|
|
// | This code is provided strictly and exclusively on the "AS IS" basis. |
|
||
|
|
// | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
|
||
|
|
// +-------------------------------------------------------------------------+
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include "stfilter.hpp"
|
||
|
|
#include "stfencod.hpp"
|
||
|
|
#include "stfhtml.hpp"
|
||
|
|
#include "version.h"
|
||
|
|
|
||
|
|
|
||
|
|
class StreamFilterOutput : public StreamFilter {
|
||
|
|
FILE *f;
|
||
|
|
public:
|
||
|
|
StreamFilterOutput() : StreamFilter(0), f(stdout) {}
|
||
|
|
~StreamFilterOutput() {}
|
||
|
|
|
||
|
|
void SetStream(FILE *s) { f = s; }
|
||
|
|
private:
|
||
|
|
virtual void FeedChar(int c) { fputc(c, f); }
|
||
|
|
};
|
||
|
|
|
||
|
|
#if 0
|
||
|
|
class StreamFilterE : public StreamFilter {
|
||
|
|
public:
|
||
|
|
StreamFilterE(StreamFilter *next) : StreamFilter(next) {}
|
||
|
|
~StreamFilterE() {}
|
||
|
|
virtual void FeedChar(int c) { PutChar(c); }
|
||
|
|
};
|
||
|
|
#endif
|
||
|
|
|
||
|
|
|
||
|
|
/* differs from the parent in that is accepts a string which
|
||
|
|
contains space-separated list of tags, builds the array
|
||
|
|
for the parent and owns both the array and the string
|
||
|
|
*/
|
||
|
|
class StreamFilterHtmlTagsHolder : public StreamFilterHtmlTags {
|
||
|
|
char *str;
|
||
|
|
char **tags;
|
||
|
|
char *str2;
|
||
|
|
char **attrs;
|
||
|
|
public:
|
||
|
|
StreamFilterHtmlTagsHolder(const char *ts, const char *as,
|
||
|
|
StreamFilter *rest);
|
||
|
|
~StreamFilterHtmlTagsHolder();
|
||
|
|
};
|
||
|
|
|
||
|
|
static const char * const the_space_preserving_tags[] = {
|
||
|
|
"pre", "ul", "ol", "p", "h1", "h2", "h3", "h4", "h5", "h6", 0
|
||
|
|
};
|
||
|
|
|
||
|
|
static char *strdup_n(const char *s)
|
||
|
|
{
|
||
|
|
int len = strlen(s);
|
||
|
|
char *res = new char[len+1];
|
||
|
|
strcpy(res, s);
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
|
||
|
|
// modifies the str placing zeroes as appropriate
|
||
|
|
// the array returned consists of pointers into the string
|
||
|
|
static char **make_argv(char *str, int n)
|
||
|
|
{
|
||
|
|
char **res;
|
||
|
|
while(*str && (*str == ' ' || *str == '\t'))
|
||
|
|
str++;
|
||
|
|
if(!*str) {
|
||
|
|
res = new char* [n+1];
|
||
|
|
res[n] = 0;
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
char *pos = str;
|
||
|
|
while(*str && *str != ' ' && *str != '\t')
|
||
|
|
str++;
|
||
|
|
if(*str) {
|
||
|
|
*str = 0;
|
||
|
|
str++;
|
||
|
|
}
|
||
|
|
res = make_argv(str, n+1);
|
||
|
|
res[n] = pos;
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
|
||
|
|
StreamFilterHtmlTagsHolder::
|
||
|
|
StreamFilterHtmlTagsHolder(const char *ts, const char *as, StreamFilter *rest)
|
||
|
|
: StreamFilterHtmlTags((tags = make_argv((str = strdup_n(ts)), 0)),
|
||
|
|
(attrs = make_argv((str2 = strdup_n(as)), 0)),
|
||
|
|
rest)
|
||
|
|
{
|
||
|
|
// nothing to do
|
||
|
|
}
|
||
|
|
|
||
|
|
StreamFilterHtmlTagsHolder::~StreamFilterHtmlTagsHolder()
|
||
|
|
{
|
||
|
|
delete[] tags;
|
||
|
|
delete[] str;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
static void help()
|
||
|
|
{
|
||
|
|
fprintf(stderr,
|
||
|
|
"text2text vers. " STFILTER_VERSION " (compiled " __DATE__ ")\n"
|
||
|
|
"Copyright (c) Andrey Vikt. Stolyarov, 2023-2025\n"
|
||
|
|
"\n"
|
||
|
|
"Usage: text2text <filters> [inputfile] [-o outputfile]\n"
|
||
|
|
"where <filters> are one or more of:\n"
|
||
|
|
" -htags[+|-] 'tags' 'attrs'\n"
|
||
|
|
" filter off HTML tags and attributes except the listed\n"
|
||
|
|
" (tags and attrs are space-separated lists, use quotes)\n"
|
||
|
|
" The ``+'' enables a slave (controlled) NL converter.\n"
|
||
|
|
" The ``-'' also enables NL converter, but for empty\n"
|
||
|
|
" lines only (that is, single NLs are left untouched)\n"
|
||
|
|
" Attrs may be specified by names (allow for all tags)\n"
|
||
|
|
" or in the form tag=attr (e.g. img=src) to allow the\n"
|
||
|
|
" attr for a particular tag only.\n"
|
||
|
|
" -hprot replace '<', '>' and '&' with HTML entities\n"
|
||
|
|
" -hbrk replace newlines with HTML paragraph breaks\n"
|
||
|
|
" -hbrk+ the same, but not inside certain tags\n"
|
||
|
|
" -hpar[+] same as hbrk[+], but only empty lines are\n"
|
||
|
|
" replaced, and single NLs are left untouched\n"
|
||
|
|
" -from_utf[_ht] enc convert utf8 to the 'enc' encoding\n"
|
||
|
|
" (_ht means to represent unknown chars with html entities\n"
|
||
|
|
" -to_utf enc convert the 'enc'-encoded text to utf8\n"
|
||
|
|
"\n"
|
||
|
|
"Run ``text2text -L'' for the list of available encodings;\n"
|
||
|
|
"all other params are ignored if -L is given.\n"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
struct cmdl_params {
|
||
|
|
const char *input;
|
||
|
|
const char *output;
|
||
|
|
|
||
|
|
StreamFilterOutput *dest;
|
||
|
|
|
||
|
|
cmdl_params() : input(0), output(0) {}
|
||
|
|
~cmdl_params() {} /* both strings aren't owned */
|
||
|
|
};
|
||
|
|
|
||
|
|
#define CMDLSKIP ((StreamFilter*)0)
|
||
|
|
#define CMDLERR ((StreamFilter*)-1)
|
||
|
|
|
||
|
|
static const char *find_str(const char *str, const char *const *arr)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
for(i = 0; arr[i]; i++)
|
||
|
|
if(0 == strcmp(str, arr[i]))
|
||
|
|
return arr[i];
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static StreamFilter *
|
||
|
|
make_filter_fromutf(const char *enc, bool html, StreamFilter *next)
|
||
|
|
{
|
||
|
|
const int * const *tbl = StreamFilterUtf8ToExtAscii::FindTable(enc);
|
||
|
|
if(!tbl) {
|
||
|
|
fprintf(stderr, "Encoding [%s] not found\n", enc);
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
if(html)
|
||
|
|
return new StreamFilterUtf8ToHtml(tbl, next);
|
||
|
|
else
|
||
|
|
return new StreamFilterUtf8ToExtAscii(tbl, next);
|
||
|
|
}
|
||
|
|
|
||
|
|
static StreamFilter *make_filter(const char *cmd,
|
||
|
|
const char *arg, const char *arg2,
|
||
|
|
StreamFilter *next)
|
||
|
|
{
|
||
|
|
if(0 == strcmp(cmd, "hprot"))
|
||
|
|
return new StreamFilterHtmlProtect(next);
|
||
|
|
if(0 == strcmp(cmd, "hbrk"))
|
||
|
|
return new StreamFilterHtmlReplaceNL(false, next);
|
||
|
|
if(0 == strcmp(cmd, "hbrk+")) {
|
||
|
|
StreamFilterHtmlTags *p = new StreamFilterHtmlTags(0, 0, next);
|
||
|
|
p->AddControlledNLReplacer(the_space_preserving_tags, false);
|
||
|
|
return p;
|
||
|
|
}
|
||
|
|
if(0 == strcmp(cmd, "hpar"))
|
||
|
|
return new StreamFilterHtmlReplaceNL(true, next);
|
||
|
|
if(0 == strcmp(cmd, "hpar+")) {
|
||
|
|
StreamFilterHtmlTags *p = new StreamFilterHtmlTags(0, 0, next);
|
||
|
|
p->AddControlledNLReplacer(the_space_preserving_tags, true);
|
||
|
|
return p;
|
||
|
|
}
|
||
|
|
if(0 == strcmp(cmd, "htags"))
|
||
|
|
return new StreamFilterHtmlTagsHolder(arg, arg2, next);
|
||
|
|
if(0 == strcmp(cmd, "htags+")) {
|
||
|
|
StreamFilterHtmlTagsHolder *p =
|
||
|
|
new StreamFilterHtmlTagsHolder(arg, arg2, next);
|
||
|
|
p->AddControlledNLReplacer(the_space_preserving_tags, false);
|
||
|
|
return p;
|
||
|
|
}
|
||
|
|
if(0 == strcmp(cmd, "htags-")) {
|
||
|
|
StreamFilterHtmlTagsHolder *p =
|
||
|
|
new StreamFilterHtmlTagsHolder(arg, arg2, next);
|
||
|
|
p->AddControlledNLReplacer(the_space_preserving_tags, true);
|
||
|
|
return p;
|
||
|
|
}
|
||
|
|
if(0 == strcmp(cmd, "to_utf")) {
|
||
|
|
const int *tbl = StreamFilterExtAsciiToUtf8::FindTable(arg);
|
||
|
|
if(!tbl) {
|
||
|
|
fprintf(stderr, "Encoding [%s] not found\n", arg);
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
return new StreamFilterExtAsciiToUtf8(tbl, next);
|
||
|
|
}
|
||
|
|
if(0 == strcmp(cmd, "from_utf"))
|
||
|
|
return make_filter_fromutf(arg, false, next);
|
||
|
|
if(0 == strcmp(cmd, "from_utf_ht"))
|
||
|
|
return make_filter_fromutf(arg, true, next);
|
||
|
|
fprintf(stderr, "Command %s not yet implemented, sorry\n", cmd);
|
||
|
|
return CMDLERR; /* shouldn't happen actually */
|
||
|
|
}
|
||
|
|
|
||
|
|
static void print_encoding_name_list()
|
||
|
|
{
|
||
|
|
int i, n;
|
||
|
|
n = streamfilter_encoding_name_count();
|
||
|
|
for(i = 0; i < n; i++)
|
||
|
|
printf("%s\n", streamfilter_encoding_name_by_index(i));
|
||
|
|
}
|
||
|
|
|
||
|
|
static StreamFilter *parse_cmdline(char **arg_rest, cmdl_params *cp)
|
||
|
|
{
|
||
|
|
static const char * const keys0[] = /* keys with no parameter */
|
||
|
|
{ "hprot", "hbrk", "hbrk+", "hpar", "hpar+", 0 };
|
||
|
|
static const char * const keys1[] = /* keys with 1 parameter */
|
||
|
|
{ "from_utf", "from_utf_ht", "to_utf", "o", 0 };
|
||
|
|
static const char * const keys2[] = /* keys with 2 parameters */
|
||
|
|
{ "htags", "htags+", "htags-", 0 };
|
||
|
|
|
||
|
|
if(!*arg_rest) {
|
||
|
|
cp->dest = new StreamFilterOutput();
|
||
|
|
return cp->dest;
|
||
|
|
}
|
||
|
|
|
||
|
|
if(*arg_rest[0] != '-') { /* may be it's the input filename? */
|
||
|
|
if(!cp->input) {
|
||
|
|
cp->input = *arg_rest;
|
||
|
|
return parse_cmdline(arg_rest + 1, cp);
|
||
|
|
} else {
|
||
|
|
fprintf(stderr, "too many input files, only one is allowed\n");
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if(arg_rest[0][1] == 'L' && !arg_rest[0][2]) { /* just list them? */
|
||
|
|
print_encoding_name_list();
|
||
|
|
return CMDLSKIP;
|
||
|
|
}
|
||
|
|
|
||
|
|
const char *cmd;
|
||
|
|
const char *cmdarg = 0, *cmdarg2 = 0;
|
||
|
|
char **arg_tail;
|
||
|
|
|
||
|
|
cmd = find_str((*arg_rest) + 1, keys0);
|
||
|
|
if(cmd) {
|
||
|
|
arg_tail = arg_rest + 1;
|
||
|
|
} else {
|
||
|
|
cmd = find_str((*arg_rest) + 1, keys1);
|
||
|
|
if(cmd) {
|
||
|
|
arg_tail = arg_rest + 2;
|
||
|
|
cmdarg = arg_rest[1];
|
||
|
|
if(!cmdarg || *cmdarg == '-') {
|
||
|
|
fprintf(stderr, "-%s requires an argument\n", cmd);
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
cmd = find_str((*arg_rest) + 1, keys2);
|
||
|
|
if(cmd) {
|
||
|
|
arg_tail = arg_rest + 3;
|
||
|
|
cmdarg = arg_rest[1];
|
||
|
|
cmdarg2 = arg_rest[2];
|
||
|
|
if(!cmdarg || *cmdarg == '-' || !cmdarg2 || *cmdarg2 == '-') {
|
||
|
|
fprintf(stderr, "-%s requires TWO arguments\n", cmd);
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if(!cmd) {
|
||
|
|
fprintf(stderr, "Key unknown [%s], run with no args for help\n",
|
||
|
|
*arg_rest);
|
||
|
|
return CMDLERR;
|
||
|
|
}
|
||
|
|
|
||
|
|
if(0 == strcmp(cmd, "o")) {
|
||
|
|
cp->output = cmdarg;
|
||
|
|
return parse_cmdline(arg_tail, cp);
|
||
|
|
}
|
||
|
|
|
||
|
|
StreamFilter *tail = parse_cmdline(arg_tail, cp);
|
||
|
|
if(tail == CMDLERR)
|
||
|
|
return CMDLERR;
|
||
|
|
|
||
|
|
StreamFilter *ret = make_filter(cmd, cmdarg, cmdarg2, tail);
|
||
|
|
if(ret == CMDLERR)
|
||
|
|
tail->DeleteChain();
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
int main(int argc, char **argv)
|
||
|
|
{
|
||
|
|
if(argc < 2) {
|
||
|
|
help();
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
cmdl_params cp;
|
||
|
|
StreamFilter *chain = parse_cmdline(argv + 1, &cp);
|
||
|
|
if(chain == CMDLSKIP) /* everything already done */
|
||
|
|
return 0;
|
||
|
|
if(chain == CMDLERR) /* diags already printed */
|
||
|
|
return 1;
|
||
|
|
|
||
|
|
FILE *in_f = stdin;
|
||
|
|
FILE *out_f = stdout;
|
||
|
|
|
||
|
|
if(cp.input) {
|
||
|
|
in_f = fopen(cp.input, "r");
|
||
|
|
if(!in_f) {
|
||
|
|
perror(cp.input);
|
||
|
|
return 2;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if(cp.output) {
|
||
|
|
out_f = fopen(cp.output, "w");
|
||
|
|
if(!out_f) {
|
||
|
|
perror(cp.output);
|
||
|
|
return 3;
|
||
|
|
}
|
||
|
|
cp.dest->SetStream(out_f);
|
||
|
|
}
|
||
|
|
|
||
|
|
int c;
|
||
|
|
while((c = fgetc(in_f)) != EOF)
|
||
|
|
chain->FeedChar(c);
|
||
|
|
chain->FeedEnd();
|
||
|
|
|
||
|
|
if(cp.input)
|
||
|
|
fclose(in_f);
|
||
|
|
if(cp.output)
|
||
|
|
fclose(out_f);
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|