Fixed crashes when Fl_Text_* detects illegal UTF-8 sequences. The widgets no longer do any further processing of such a sequence and simply skip over the offending character. The screen representation depends largely on whatever the underlying OS does with those sequences, but I feel that this is outside the scope of this library. (STR #2348)

git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@7965 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
This commit is contained in:
Matthias Melcher 2010-12-06 18:22:22 +00:00
parent 06e5a163cd
commit 1bac8a0cca
6 changed files with 52 additions and 37 deletions

View File

@ -1,5 +1,7 @@
CHANGES IN FLTK 1.3.0
- Fixed crashes when detecting illegal UTF-8 sequences
in Fl_Text_* widgets (STR #2348)
- Fixed Fl_Text_Display Tabulator calculations (STR #2450)
- Fixed file access code to use UTF-8 strings (STR #2440)
- Fixed ARM Unicode cross compilation issue (STR #2432)

View File

@ -34,7 +34,7 @@
#define FL_TEXT_BUFFER_H
#define ASSERT_UTF8
#undef ASSERT_UTF8
#ifdef ASSERT_UTF8
# include <assert.h>
@ -47,22 +47,11 @@
/*
Suggested UTF-8 terminology for this file:
?? "length" is the number of characters in a string
?? "size" is the number of bytes
?? "index" is the position in a string in number of characters
?? "offset" is the position in a string in bytes (and must be kept on a character boundary)
(there seems to be no standard in Unicode documents; however, "length" is commonly
referencing the number of bytes. Maybe "bytes" and "glyphs" would be the most
obvious way to describe sizes?)
"character size" is the size of a UTF-8 character in bytes
"character width" is the width of a Unicode character in pixels
"column" was originally defined as a character offset from the left margin. It was
identical to the byte offset. In UTF-8, we have neither a byte offset nor
truly fixed width fonts (*). Column could be a pixel value multiplied with
"character width" is the width of a Unicode character in pixels
"column" was originally defined as a character offset from the left margin.
It was identical to the byte offset. In UTF-8, we have neither a byte offset
nor truly fixed width fonts (*). Column could be a pixel value multiplied with
an average character width (which is a bearable approximation).
* in Unicode, there are no fixed width fonts! Even if the ASCII characters may

View File

@ -99,13 +99,16 @@ FL_EXPORT int fl_utf8bytes(unsigned ucs);
/* OD: returns the byte length of the first UTF-8 char sequence (returns -1 if not valid) */
FL_EXPORT int fl_utf8len(char c);
/* OD: returns the byte length of the first UTF-8 char sequence (returns +1 if not valid) */
FL_EXPORT int fl_utf8len1(char c);
/* OD: returns the number of Unicode chars in the UTF-8 string */
FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len);
/* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how many bytes were used) */
FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len);
/* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes used */
FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf);

View File

@ -1025,7 +1025,7 @@ int Fl_Text_Buffer::search_forward(int startPos, const char *searchString,
*foundPos = startPos;
return 1;
}
int l = fl_utf8len(c);
int l = fl_utf8len1(c);
if (memcmp(sp, address(bp), l))
break;
sp += l; bp += l;
@ -1077,7 +1077,7 @@ int Fl_Text_Buffer::search_backward(int startPos, const char *searchString,
*foundPos = startPos;
return 1;
}
int l = fl_utf8len(c);
int l = fl_utf8len1(c);
if (memcmp(sp, address(bp), l))
break;
sp += l; bp += l;
@ -1602,7 +1602,7 @@ int Fl_Text_Buffer::prev_char(int pos) const
int Fl_Text_Buffer::next_char(int pos) const
{
IS_UTF8_ALIGNED2(this, (pos))
int n = fl_utf8len(byte_at(pos));
int n = fl_utf8len1(byte_at(pos));
pos += n;
if (pos>=mLength)
return mLength;

View File

@ -753,7 +753,7 @@ void Fl_Text_Display::overstrike(const char* text) {
/* determine how many displayed character positions are covered */
startIndent = mBuffer->count_displayed_characters( lineStart, startPos );
indent = startIndent;
for ( c = text; *c != '\0'; c += fl_utf8len(*c) )
for ( c = text; *c != '\0'; c += fl_utf8len1(*c) )
indent++;
endIndent = indent;
@ -1735,7 +1735,7 @@ int Fl_Text_Display::handle_vline(
style = position_style(lineStartPos, lineLen, 0);
for (i=0; i<lineLen; ) {
currChar = lineStr[i]; // one byte is enough to handele tabs and other cases
int len = fl_utf8len(currChar);
int len = fl_utf8len1(currChar);
if (len<=0) len = 1; // OUCH!
charStyle = position_style(lineStartPos, lineLen, i);
if (charStyle!=style || currChar=='\t' || prevChar=='\t') {
@ -1829,7 +1829,7 @@ int Fl_Text_Display::find_x(const char *s, int len, int style, int x) const {
// TODO: use binary search which may be quicker.
int i = 0;
while (i<len) {
int cl = fl_utf8len(s[i]);
int cl = fl_utf8len1(s[i]);
int w = int( string_width(s, i+cl, style) );
if (w>x)
return i;
@ -3204,7 +3204,7 @@ double Fl_Text_Display::measure_proportional_character(const char *s, int xPix,
return (((xPix/tab)+1)*tab) - xPix;
}
int charLen = fl_utf8len(*s), style = 0;
int charLen = fl_utf8len1(*s), style = 0;
if (mStyleBuffer) {
style = mStyleBuffer->byte_at(pos);
}
@ -3284,7 +3284,7 @@ int Fl_Text_Display::wrap_uses_character(int lineEndPos) const {
c = buffer()->char_at(lineEndPos);
return c == '\n' || ((c == '\t' || c == ' ') &&
lineEndPos + fl_utf8len(c) < buffer()->length());
lineEndPos + fl_utf8len1(c) < buffer()->length());
}

View File

@ -112,9 +112,11 @@ Toupper(
}
/**
return the byte length of the UTF-8 sequence with first byte \p c,
or -1 if \p c is not valid.
*/
return the byte length of the UTF-8 sequence with first byte \p c,
or -1 if \p c is not valid.
This function is helpful for finding faulty UTF8 sequences.
\see fl_utf8len1
*/
int fl_utf8len(char c)
{
if (!(c & 0x80)) return 1;
@ -137,15 +139,34 @@ int fl_utf8len(char c)
} // fl_utf8len
#if 0
int fl_utflen(
const unsigned char *buf,
int len)
/**
Return the byte length of the UTF-8 sequence with first byte \p c,
or 1 if \p c is not valid.
This function can be used to scan faulty UTF-8 sequences, albeit ignoring invalid
codes.
\see fl_utf8len
*/
int fl_utf8len1(char c)
{
unsigned int ucs;
return fl_utf2ucs(buf, len, &ucs);
}
#endif
if (!(c & 0x80)) return 1;
if (c & 0x40) {
if (c & 0x20) {
if (c & 0x10) {
if (c & 0x08) {
if (c & 0x04) {
return 6;
}
return 5;
}
return 4;
}
return 3;
}
return 2;
}
return 1;
} // fl_utf8len1
/**
returns the number of Unicode chars in the UTF-8 string