diff --git a/FL/fl_utf8.h b/FL/fl_utf8.h index bd61ab804..dc311327f 100644 --- a/FL/fl_utf8.h +++ b/FL/fl_utf8.h @@ -210,6 +210,9 @@ FL_EXPORT void fl_make_path_for_file( const char *path ); /* OD: recursively create a path in the file system */ FL_EXPORT char fl_make_path( const char *path ); +FL_EXPORT const char *fl_utf8_next_composed_char(const char *from, const char *end); + +FL_EXPORT const char *fl_utf8_previous_composed_char(const char *from, const char *begin); /** @} */ diff --git a/src/Fl_Input.cxx b/src/Fl_Input.cxx index fd66d8f24..7c2644126 100644 --- a/src/Fl_Input.cxx +++ b/src/Fl_Input.cxx @@ -190,14 +190,20 @@ int Fl_Input::kf_delete_eol() { int Fl_Input::kf_delete_char_right() { if (readonly()) { fl_beep(); return 1; } if (mark() != insert_position()) cut(); - else cut(1); + else { + const char *next = fl_utf8_next_composed_char(value() + insert_position(), value() + size()); + replace(insert_position(), next - value(), 0); + } return 1; } int Fl_Input::kf_delete_char_left() { if (readonly()) { fl_beep(); return 1; } if (mark() != insert_position()) cut(); - else cut(-1); + else { + const char *before = fl_utf8_previous_composed_char(value() + insert_position(), value()); + replace(insert_position(), before - value(), 0); + } return 1; } @@ -225,7 +231,8 @@ int Fl_Input::kf_clear_eol() { // If OPTION_ARROW_FOCUS is disabled, return 1 to prevent focus navigation. // int Fl_Input::kf_move_char_left() { - int i = shift_position(insert_position()-1) + NORMAL_INPUT_MOVE; + const char *before = fl_utf8_previous_composed_char(value() + insert_position(), value()); + int i = shift_position(before - value()) + NORMAL_INPUT_MOVE; return Fl::option(Fl::OPTION_ARROW_FOCUS) ? i : 1; } @@ -233,7 +240,8 @@ int Fl_Input::kf_move_char_left() { // If OPTION_ARROW_FOCUS is disabled, return 1 to prevent focus navigation. 
// int Fl_Input::kf_move_char_right() { - int i = shift_position(insert_position()+1) + NORMAL_INPUT_MOVE; + const char *next = fl_utf8_next_composed_char(value() + insert_position(), value() + size()); + int i = shift_position(next - value()) + NORMAL_INPUT_MOVE; return Fl::option(Fl::OPTION_ARROW_FOCUS) ? i : 1; } diff --git a/src/Fl_Input_.cxx b/src/Fl_Input_.cxx index 18945c242..4c80449b6 100644 --- a/src/Fl_Input_.cxx +++ b/src/Fl_Input_.cxx @@ -658,8 +658,7 @@ void Fl_Input_::handle_mouse(int X, int Y, int /*W*/, int /*H*/, int drag) { const char *l, *r, *t; double f0 = Fl::event_x()-X+xscroll_; for (l = p, r = e; l 0) { f1 = X-xscroll_+expandpos(p, l + cw, buf, 0) - Fl::event_x(); if (f1 < f0) l = l+cw; diff --git a/src/Fl_Text_Buffer.cxx b/src/Fl_Text_Buffer.cxx index f26afb39d..ebd67c107 100644 --- a/src/Fl_Text_Buffer.cxx +++ b/src/Fl_Text_Buffer.cxx @@ -2075,15 +2075,17 @@ int Fl_Text_Buffer::prev_char_clipped(int pos) const return 0; IS_UTF8_ALIGNED2(this, (pos)) - - char c; - do { - pos--; - if (pos==0) - return 0; - c = byte_at(pos); - } while ( (c&0xc0) == 0x80); - + const int l_t = 40; + char t[l_t + 1]; t[l_t] = 0; + int l = l_t, p = pos, ll; + for (int i = l_t; i > 0 && p > 0; i--) { + t[--l] = byte_at(--p); + ll = fl_utf8len(t[l]); + if (ll == 1 || ll == 2) break; + } + const char *previous = fl_utf8_previous_composed_char(t + l_t, t + l); + ll = strlen(t + l); + pos = (pos - ll) + (previous - (t+l)); IS_UTF8_ALIGNED2(this, (pos)) return pos; } @@ -2091,6 +2093,7 @@ int Fl_Text_Buffer::prev_char_clipped(int pos) const /* Return the previous character position. + This function processes a composed character (e.g., a flag emoji) as a single character. Returns -1 if the beginning of the buffer is reached. */ int Fl_Text_Buffer::prev_char(int pos) const @@ -2102,13 +2105,28 @@ int Fl_Text_Buffer::prev_char(int pos) const /* Return the next character position. + This function processes a composed character (e.g., a flag emoji) as a single character. 
Returns length() if the end of the buffer is reached. */ int Fl_Text_Buffer::next_char(int pos) const { IS_UTF8_ALIGNED2(this, (pos)) - int n = fl_utf8len1(byte_at(pos)); - pos += n; + unsigned l = fl_utf8len1(byte_at(pos)); + if (l > 2) { // test for composed character only if pos is at long codepoint + int p = pos, ll, b; + char t[40]; // crazyest composed characters I know use 28 bytes in UTF8 (e.g., 🏴󠁧󠁢󠁷󠁬󠁳󠁿) + l = 0; + // extract bytes after pos stopping after short codepoint or 40 bytes at most + while (p < mLength && l < sizeof(t)) { + b = byte_at(p++); + t[l++] = b; + ll = fl_utf8len(b); + for (int i = 1; i < ll && l < sizeof(t); i++) t[l++] = byte_at(p++); + if (ll == 1 || ll == 2) break; // stop after short codepoint (includes '\n') + } + l = fl_utf8_next_composed_char(t, t + l) - t; // length of possibly composed character starting at pos + } + pos += l; if (pos>=mLength) return mLength; IS_UTF8_ALIGNED2(this, (pos)) diff --git a/src/Fl_Text_Display.cxx b/src/Fl_Text_Display.cxx index 7f14623d6..cad201daa 100644 --- a/src/Fl_Text_Display.cxx +++ b/src/Fl_Text_Display.cxx @@ -2263,7 +2263,8 @@ int Fl_Text_Display::find_x(const char *s, int len, int style, int x) const { int i = 0; int last_w = 0; // STR #2788 while (ix) { if (cursor_pos && (w-x < x-last_w)) return i+cl; // STR #2788 diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx index 2ca9d6b2f..f7a73359e 100644 --- a/src/fl_utf8.cxx +++ b/src/fl_utf8.cxx @@ -1089,8 +1089,13 @@ unsigned fl_utf8toa(const char* src, unsigned srclen, dst[count] = c; p++; } else { - int len; unsigned ucs = fl_utf8decode(p,e,&len); - p += len; + unsigned ucs = 0x100; + int len = fl_utf8len(*p); + if (len > 2) p = fl_utf8_next_composed_char(p, e); + else { + ucs = fl_utf8decode(p,e,&len); + p += len; + } if (ucs < 0x100) dst[count] = ucs; else dst[count] = '?'; } @@ -1100,9 +1105,11 @@ unsigned fl_utf8toa(const char* src, unsigned srclen, while (p < e) { if (!(*p & 0x80)) p++; else { - int len; - fl_utf8decode(p,e,&len); - p += 
len;
+      int len = fl_utf8len1(*p);
+      if (len > 2) p = fl_utf8_next_composed_char(p, e);
+      else {
+        p += len;
+      }
     }
     ++count;
   }
@@ -1393,4 +1400,115 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr
   return Fl::system_driver()->utf8from_mb(dst, dstlen, src, srclen);
 }
 
+/* Steps over the codepoint that starts at p, clamping the result to end. An invalid
+   lead byte counts as one byte so that the returned pointer always moves forward. */
+static const char *utf8_advance_clamped(const char *p, const char *end) {
+  if (p >= end) return end;
+  int len = fl_utf8len1(*p);
+  return (end - p > len) ? p + len : end;
+}
+
+/**
+  Returns pointer to beginning of character after given location in UTF8 string accounting for emoji sequences.
+  Unicode encodes some emojis (examples: "woman pilot", "San Marino flag", "keycap 9")
+  via an emoji sequence, that is, they are represented by sequences of consecutive unicode points.
+  An emoji sequence may pair two successive codepoints with "zero-width joiner" and may qualify
+  any component with "variation selectors" or "Fitzpatrick emoji modifiers". Most flag emojis are encoded with two successive
+  "regional indicator symbols". Keycap emojis are encoded with key + "emoji variation selector" + "combining enclosing keycap".
+  The returned pointer never exceeds \p end, even if the string stops in the middle of a codepoint or of a sequence.
+  \param from points to a location within a UTF8 string. If this location is inside the UTF8
+  encoding of a codepoint or is an invalid byte, this function returns \p from + 1.
+  If \p from is at or beyond \p end, this function returns \p end.
+  \param end points past last codepoint of the string.
+  \return pointer to beginning of first codepoint after character, possibly an emoji sequence, that begins at \p from.
+ */
+const char *fl_utf8_next_composed_char(const char *from, const char *end) {
+  if (from >= end) return end; // nothing left to skip: do not read *from, do not point past end
+  int skip = fl_utf8len(*from);
+  if (skip == -1) return from + 1;
+  if (end - from <= skip) return end; // last codepoint of the string, complete or truncated
+  unsigned u;
+  if (skip >= 4) {
+    u = fl_utf8decode(from, end, NULL);
+    if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
+      u = fl_utf8decode(from + skip, end, NULL);
+      if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag
+        return from + 2 * skip;
+      }
+    } else if (u == 0x1F3F4) { // "waving black flag" may start subdivision flags
+      const char *next = from + skip;
+      while (next < end) { // never decode at or beyond end
+        u = fl_utf8decode(next, end, NULL);
+        if (u != 0xE007F && (u < 0xE0020 || u > 0xE007E)) break; // not a tag: no subdivision flag here
+        next = utf8_advance_clamped(next, end);
+        if (u == 0xE007F) return next; // ends with "cancel tag"
+      }
+    }
+  }
+  from += skip; // skip 1st codepoint
+  while (from < end) {
+    u = fl_utf8decode(from, end, NULL);
+    if (u == 0x200D) { // zero-width joiner
+      from = utf8_advance_clamped(from, end); // skip joiner
+      from = utf8_advance_clamped(from, end); // skip joined codepoint, moving forward even if it is invalid
+    } else if ((u >= 0xFE00 && u <= 0xFE0F) || // a variation selector
+               (u >= 0x1F3FB && u <= 0x1F3FF) || // EMOJI MODIFIER FITZPATRICK
+               u == 0x20E3) { // combining enclosing keycap
+      from = utf8_advance_clamped(from, end); // skip it
+    } else break;
+  }
+  return from;
+}
+
+
+/**
+  Returns pointer to beginning of character before given location in UTF8 string accounting for emoji sequences.
+  See fl_utf8_next_composed_char() for a hint about what is an emoji sequence.
+  The returned pointer never precedes \p begin and no backward scan is started before \p begin.
+  \param from points to a location within a UTF8 string; the byte at \p from is read, so it must be
+  dereferenceable (e.g., the terminating NUL). If this location is inside the UTF8
+  encoding of a codepoint or is an invalid byte, this function returns \p from - 1.
+  If \p from is at or before \p begin, this function returns \p begin.
+  \param begin points to start of first codepoint of the string.
+  \return pointer to beginning of first character, possibly an emoji sequence, before the codepoint that begins at \p from.
+ */
+const char *fl_utf8_previous_composed_char(const char *from, const char *begin) {
+  if (from <= begin) return begin; // nothing precedes the start: never return a pointer before begin
+  int l = fl_utf8len(*from);
+  if (l == -1) return from - 1;
+  const char *limit = from + l; // upper bound given to fl_utf8decode() for everything located before from
+  from = fl_utf8back(from - 1, begin, NULL);
+  unsigned u = fl_utf8decode(from, limit, NULL);
+  if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
+    if (from > begin) { // look further back only if a codepoint can exist there
+      const char *previous = fl_utf8back(from - 1, begin, NULL);
+      u = fl_utf8decode(previous, limit, NULL);
+      if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag
+        return previous;
+      }
+    }
+  } else if (u == 0xE007F) { // ends with "cancel tag"
+    const char *previous = from;
+    do {
+      if (previous <= begin) return begin;
+      previous = fl_utf8back(previous - 1, begin, NULL);
+      u = fl_utf8decode(previous, limit, NULL);
+      if (u == 0x1F3F4) return previous; // "waving black flag" starts subdivision flags
+    } while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components"
+  }
+  while (from > begin) { // at begin there is nothing left to attach to
+    u = fl_utf8decode(from, limit, NULL);
+    if ((u >= 0xFE00 && u <= 0xFE0F) || // a variation selector
+        (u >= 0x1F3FB && u <= 0x1F3FF) || // EMOJI MODIFIER FITZPATRICK
+        u == 0x20E3) { // combining enclosing keycap
+      from = fl_utf8back(from - 1, begin, NULL); // qualifier: the character starts further back
+      continue;
+    }
+    const char *joiner = fl_utf8back(from - 1, begin, NULL);
+    if (joiner <= begin || fl_utf8decode(joiner, from, NULL) != 0x200D) return from; // no joiner with something before it
+    from = fl_utf8back(joiner - 1, begin, NULL); // hop over the zero-width joiner to the joined codepoint
+  }
+  return from;
+}
+
 /** @} */