thalassa/lib/stfilter/stfencod.cpp
2026-03-19 06:23:52 +05:00

559 lines
20 KiB
C++

// +-------------------------------------------------------------------------+
// | StreamFilters library vers. 0.2.03 |
// | Copyright (c) Andrey V. Stolyarov <croco at croco dot net> 2022-2025 |
// | ----------------------------------------------------------------------- |
// | This is free software. Permission is granted to everyone to use, copy |
// | or modify this software under the terms and conditions of |
// | GNU LESSER GENERAL PUBLIC LICENSE, v. 2.1 |
// | as published by Free Software Foundation (see the file LGPL.txt) |
// | |
// | Please visit http://www.croco.net/software/stfilter to get a fresh copy |
// | ----------------------------------------------------------------------- |
// | This code is provided strictly and exclusively on the "AS IS" basis. |
// | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
// +-------------------------------------------------------------------------+
#include "stfencod.hpp"
void StreamFilterExtAsciiToUtf8::FeedChar(int c)
{
int c2;
if(c < 0x80) { /* ASCII symbol goes unchanged */
PutChar(c);
return;
}
c2 = the_table[c - 0x80];
#if 0
/* it is assumed here that the table doesn't contain
codes less than 0x80 (ascii codes) which is true
for all ExtAscii encodings known to the author;
if, however, you decide to apply this object to
a table that contains at least a single code of
the ASCII range, then replace #if 0 with #if 1
*/
if(c2 < 0x80) { /* ASCII symbol goes unchanged */
PutChar(c2);
return;
}
#endif
if(c2 < 0x800) { /* 110xxxxx 10xxxxxx */
PutChar(0xC0 | ((c2 >> 6) & 0x1F));
PutChar(0x80 | (c2 & 0x3F));
return;
}
if(c2 < 0x10000) { /* 1110xxxx 10xxxxxx 10xxxxxx */
PutChar(0xE0 | ((c2 >> 12) & 0x0F));
PutChar(0x80 | ((c2 >> 6) & 0x3F));
PutChar(0x80 | (c2 & 0x3F));
return;
}
/* well, here perhaps should be a check for (c2 < 0x11000) which is
the UTF8 upper limit, but we just drop the "extra" bits; finally,
these codes come from the table supplied by the application, not
retrieved from the external world, so we leave it to the programmer
to make sure the codes are correct.
*/
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
PutChar(0xF0 | ((c2 >> 18) & 0x03));
PutChar(0x80 | ((c2 >> 12) & 0x3F));
PutChar(0x80 | ((c2 >> 6) & 0x3F));
PutChar(0x80 | (c2 & 0x3F));
}
void StreamFilterUtf8ToExtAscii::FeedChar(int c)
{
// first of all, perform the self-synchronizing error check
if(expected != 0 && (c & 0xC0) != 0x80) {
// unexpected byte that (perhaps) starts another symbol
// we report error, reset the state and let the byte go
DecodingError(current_err);
expected = 0;
}
if(expected == 0) {
current_err = c & 0xFF;
received = 0;
if((c & 0x80) == 0) { // single-byte (ASCII)
PutChar(c);
} else
if((c & 0xE0) == 0xC0) { // two-byte
current_code = c & 0x1F;
expected = 1;
} else
if((c & 0xF0) == 0xE0) { // three-byte
current_code = c & 0x0F;
expected = 2;
} else
if((c & 0xF8) == 0xF0) { // four-byte
current_code = c & 0x07;
expected = 3;
} else {
DecodingError(current_err);
}
} else {
// okay, we're sure it's a continuation byte, because the case
// of expected > 0 and not a 10xxxxxx byte is handled already
static const int min_lims[3] = { 0x80, 0x800, 0x10000 };
current_err = (current_err << 8) | (c & 0xFF);
current_code = (current_code << 6) | (c & 0x3F);
received++;
if(received >= expected) {
if(current_code<min_lims[expected-1] || current_code>0x10FFFF)
DecodingError(current_err);
else
HandleMultibyte(current_code);
expected = 0;
}
}
}
void StreamFilterUtf8ToExtAscii::FeedEnd()
{
if(expected != 0) {
DecodingError(current_err);
expected = 0;
}
}
void StreamFilterUtf8ToExtAscii::Reset()
{
expected = 0;
received = 0;
}
// by default, we output smth. like ``#err:C09F ''
void StreamFilterUtf8ToExtAscii::DecodingError(int err)
{
PutStr("#err:");
PutHex(err, 0, true);
PutChar(' ');
}
// by default, we output smth. like ``#[AF23]''
void StreamFilterUtf8ToExtAscii::UnknownCode(int code)
{
PutStr("#[");
PutHex(code, 5, true);
PutChar(']');
}
void StreamFilterUtf8ToExtAscii::HandleMultibyte(int code)
{
int rescode = -1;
// first, try to find it in the table
// remember, each ``line'' of the table starts with
// the first codepoint and the length
const int * const *p;
for(p = the_table; *p; p++) {
if(code >= (*p)[0] && code < (*p)[0] + (*p)[1]) { // found?
rescode = (*p)[code - (*p)[0] + 2];
break;
}
}
if(rescode == -1) // remains unknown
UnknownCode(code);
else
PutChar(rescode);
}
/////////////////////////////////////////////////////////////////////
// boring tables follow
// koi8r ////////////////////////////////////////////////////////////
static const int koi8r_to_unicode[128] = { /* offset 0x80 */
0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524,
0x252C, 0x2534, 0x253C, 0x2580, 0x2584, 0x2588, 0x258C, 0x2590,
0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248,
0x2264, 0x2265, 0x00A0, 0x2321, 0x00B0, 0x00B2, 0x00B7, 0x00F7,
0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
0x2557, 0x2558, 0x2559, 0x255A, 0x255B, 0x255C, 0x255D, 0x255E,
0x255F, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
0x2566, 0x2567, 0x2568, 0x2569, 0x256A, 0x256B, 0x256C, 0x00A9,
0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
0x0445, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E,
0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A,
0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E,
0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
0x042C, 0x042B, 0x0417, 0x0428, 0x042D, 0x0429, 0x0427, 0x042A
};
static const int unicode_to_koi8r_401[83] = { 0x00401, 81,
0xb3, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0xe1,
0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9,
0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf2,
0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, 0xfb,
0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, 0xc1,
0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9,
0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd2,
0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, 0xdb,
0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, -1,
0xa3
};
static const int unicode_to_koi8r_2500[163] = { 0x02500, 161,
0x80, -1, 0x81, -1, -1, -1, -1, -1,
-1, -1, -1, -1, 0x82, -1, -1, -1,
0x83, -1, -1, -1, 0x84, -1, -1, -1,
0x85, -1, -1, -1, 0x86, -1, -1, -1,
-1, -1, -1, -1, 0x87, -1, -1, -1,
-1, -1, -1, -1, 0x88, -1, -1, -1,
-1, -1, -1, -1, 0x89, -1, -1, -1,
-1, -1, -1, -1, 0x8a, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8,
0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9,
0xba, 0xbb, 0xbc, 0xbd, 0xbe, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
0x8b, -1, -1, -1, 0x8c, -1, -1, -1,
0x8d, -1, -1, -1, 0x8e, -1, -1, -1,
0x8f, 0x90, 0x91, 0x92, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
0x94
};
static const int unicode_to_koi8r_2219[267] = { 0x02219, 265,
0x95, 0x96, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0x97,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 0x98, 0x99, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0x93,
0x9b
};
static const int unicode_to_koi8r_a0[90] = { 0x000a0, 88,
0x9a, -1, -1, -1, -1, -1, -1, -1,
-1, 0xbf, -1, -1, -1, -1, -1, -1,
0x9c, -1, 0x9d, -1, -1, -1, -1, 0x9e,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0x9f
};
static const int * const unicode_to_koi8r[] = {
unicode_to_koi8r_401,
unicode_to_koi8r_2500,
unicode_to_koi8r_2219,
unicode_to_koi8r_a0,
0
};
// cp1251 ////////////////////////////////////////////////////////////
static const int cp1251_to_unicode[128] = { /* offset 0x80 */
/* NOTE undefined position at 0x98 filled with 0xFFFD */
0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457,
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F
};
static const int unicode_to_cp1251_401[147] = { 0x00401, 145,
0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, 0xa3,
0x8a, 0x8c, 0x8e, 0x8d, -1, 0xa1, 0x8f, 0xc0,
0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,
0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,
0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, -1,
0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, 0xbc,
0x9a, 0x9c, 0x9e, 0x9d, -1, 0xa2, 0x9f, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0xa5,
0xb4
};
static const int unicode_to_cp1251_a0[30] = { 0x000a0, 28,
0xa0, -1, -1, -1, 0xa4, -1, 0xa6, 0xa7,
-1, 0xa9, -1, 0xab, 0xac, 0xad, 0xae, -1,
0xb0, 0xb1, -1, -1, -1, 0xb5, 0xb6, 0xb7,
-1, -1, -1, 0xbb
};
static const int unicode_to_cp1251_2013[274] = { 0x02013, 272,
0x96, 0x97, -1, -1, -1, 0x91, 0x92, 0x82,
-1, 0x93, 0x94, 0x84, -1, 0x86, 0x87, 0x95,
-1, -1, -1, 0x85, -1, -1, -1, -1,
-1, -1, -1, -1, -1, 0x89, -1, -1,
-1, -1, -1, -1, -1, -1, 0x8b, 0x9b,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, 0x88, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 0xb9, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0x99
};
const int * const unicode_to_cp1251[] = {
unicode_to_cp1251_401,
unicode_to_cp1251_a0,
unicode_to_cp1251_2013,
0
};
// ascii ////////////////////////////////////////////////////////////
static const int * const unicode_to_ascii[] = {
0
};
// encoding directory ///////////////////////////////////////////////
#if 0
struct encoding_info {
const char *name;
const int *to_unicode;
const int * const *from_unicode;
};
static const encoding_info encoding_info_base[] = {
{ "koi8-r", koi8r_to_unicode, unicode_to_koi8r },
{ "koi8r", koi8r_to_unicode, unicode_to_koi8r },
{ "cp1251", cp1251_to_unicode, unicode_to_cp1251 },
{ "koi8", koi8r_to_unicode, unicode_to_koi8r },
{ "koi-8", koi8r_to_unicode, unicode_to_koi8r },
{ "1251", cp1251_to_unicode, unicode_to_cp1251 },
{ "windows-1251", cp1251_to_unicode, unicode_to_cp1251 },
{ "ascii", 0, unicode_to_ascii },
{ "us-ascii", 0, unicode_to_ascii },
{ 0, 0, 0 }
};
#endif
// we don't use strcasecmp in order to avoid linking locale support
static bool string_eq_nc(const char *freestr, const char *lowcasestr)
{
int i;
for(i = 0; ; i++) {
if(!freestr[i] && !lowcasestr[i]) // both empty
return true;
if(!freestr[i] || !lowcasestr[i]) // one empty, not both
return false;
if(freestr[i] == lowcasestr[i])
continue;
if(freestr[i] >= 'A' && freestr[i] <= 'Z' &&
freestr[i]-'A'+'a' == lowcasestr[i])
{
continue;
}
return false;
}
// we'll never get here
}
#if 0
static const encoding_info *find_encinfo(const char *name)
{
const encoding_info *p;
for(p = encoding_info_base; p->name; p++)
if(string_eq_nc(name, p->name))
return p;
return 0;
}
const int *StreamFilterExtAsciiToUtf8::FindTable(const char *name)
{
const encoding_info *p = find_encinfo(name);
return p ? p->to_unicode : 0;
}
const int * const *StreamFilterUtf8ToExtAscii::FindTable(const char *name)
{
const encoding_info *p = find_encinfo(name);
return p ? p->from_unicode : 0;
}
#endif
const int *StreamFilterExtAsciiToUtf8::FindTable(const char *name)
{
int n = streamfilter_find_encoding(name);
return GetTable(n);
}
const int * const *StreamFilterUtf8ToExtAscii::FindTable(const char *name)
{
int n = streamfilter_find_encoding(name);
return GetTable(n);
}
const int *StreamFilterExtAsciiToUtf8::GetTable(int enc)
{
switch(enc) {
case streamfilter_enc_ascii:
return 0;
case streamfilter_enc_koi8r:
return koi8r_to_unicode;
case streamfilter_enc_cp1251:
return cp1251_to_unicode;
default:
return 0;
}
}
const int * const *StreamFilterUtf8ToExtAscii::GetTable(int enc)
{
switch(enc) {
case streamfilter_enc_ascii:
return unicode_to_ascii;
case streamfilter_enc_koi8r:
return unicode_to_koi8r;
case streamfilter_enc_cp1251:
return unicode_to_cp1251;
default:
return 0;
}
}
StreamFilterKoi8rToUtf8::StreamFilterKoi8rToUtf8(StreamFilter *next)
: StreamFilterExtAsciiToUtf8(koi8r_to_unicode, next)
{}
struct encentry {
const char *name;
int code;
};
static const struct encentry encs[] = {
{ "utf8", streamfilter_enc_utf8 },
{ "ascii", streamfilter_enc_ascii },
{ "koi8-r", streamfilter_enc_koi8r },
{ "cp1251", streamfilter_enc_cp1251 },
{ "utf-8", streamfilter_enc_utf8 },
{ "us-ascii", streamfilter_enc_ascii },
{ "koi8r", streamfilter_enc_koi8r },
{ "koi8", streamfilter_enc_koi8r },
{ "1251", streamfilter_enc_cp1251 },
{ "win1251", streamfilter_enc_cp1251 },
{ "win-1251", streamfilter_enc_cp1251 },
{ "windows-1251", streamfilter_enc_cp1251 },
{ 0, 0 }
};
int streamfilter_find_encoding(const char *enc)
{
int i;
for(i = 0; encs[i].name; i++)
if(string_eq_nc(enc, encs[i].name))
return encs[i].code;
return streamfilter_enc_unknown;
}
int streamfilter_encoding_name_count()
{
return sizeof(encs) / sizeof(*encs) - 1;
}
const char *streamfilter_encoding_name_by_index(int index)
{
return encs[index].name;
}
// ! must be in sync with enum streamfilter_char_encodings, starting with 0
static const char * const char_encoding_names[streamfilter_enc_lastknown+1] = {
"utf8", "ascii", "koi8-r", "cp1251"
};
const char *streamfilter_encoding_canonical_name(const char *enc)
{
int c = streamfilter_find_encoding(enc);
if(c < 0 || c > streamfilter_enc_lastknown)
return 0;
return char_encoding_names[c];
}