Emojis: add support for keycap emoji sequences.

- It is expected that all emojis listed in the Wikipedia "emoji" article
as of early 2026 are recognized as single glyphs by FLTK text widgets.
- Document functions fl_utf8_{next|previous}_composed_char()
in terms of the notion of "emoji sequence".
- Remove signed/unsigned comparison compilation warnings.
This commit is contained in:
ManoloFLTK 2026-01-29 13:04:35 +01:00
parent 445d26bb71
commit bed38ba3f5
2 changed files with 32 additions and 24 deletions

View File

@ -2127,7 +2127,7 @@ int Fl_Text_Buffer::prev_char(int pos) const
int Fl_Text_Buffer::next_char(int pos) const
{
IS_UTF8_ALIGNED2(this, (pos))
int l = fl_utf8len1(byte_at(pos));
unsigned l = fl_utf8len1(byte_at(pos));
if (l > 2) { // test for composed character only if pos is at long codepoint
int p = pos, ll, b;
char t[40]; // craziest composed characters I know use 28 bytes in UTF8 (e.g., 🏴󠁧󠁢󠁷󠁬󠁳󠁿)

View File

@ -1637,32 +1637,36 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr
/**
Returns pointer to beginning of character after given location in UTF8 string accounting for composed characters.
Some unicode characters (examples: 👩 "woman pilot", 🇸🇲 "San Marino flag") are composed of several unicode points.
They may pair two successive codepoints with U+200D (zero-width joiner) and may qualify any component with
variation selectors or Fitzpatrick emoji modifiers. Most flag emojis are composed of two successive
"regional indicator symbols", each in range [U+1F1E6 , U+1F1FF].
Returns pointer to beginning of character after given location in UTF8 string accounting for emoji sequences.
Unicode encodes some emojis (examples: 👩‍✈️ "woman pilot", 🇸🇲 "San Marino flag", 9️⃣ "keycap 9")
via an <b>emoji sequence</b>, that is, each of them is represented by a sequence of consecutive Unicode code points.
An emoji sequence may pair two successive codepoints with a "zero-width joiner" (U+200D) and may qualify
any component with "variation selectors" or "Fitzpatrick emoji modifiers". Most flag emojis are encoded with two successive
"regional indicator symbols", each in range [U+1F1E6, U+1F1FF]. Keycap emojis are encoded as
key + "emoji variation selector" (U+FE0F) + "combining enclosing keycap" (U+20E3).
\param from points to a location within a UTF8 string. If this location is inside the UTF8
encoding of a codepoint or is an invalid byte, this function returns \p from + 1.
\param end points past last codepoint of the string.
\return pointer to beginning of first codepoint after possibly composed character that begins at \p from.
\return pointer to the beginning of the first codepoint after the character, possibly an emoji sequence, that begins at \p from.
*/
const char *fl_utf8_next_composed_char(const char *from, const char *end) {
int skip = fl_utf8len1(*from);
if (skip <= 2) return from + skip;
unsigned u = fl_utf8decode(from, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
u = fl_utf8decode(from + skip, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag
return from + 2 * skip;
int skip = fl_utf8len(*from);
if (skip == -1) return from + 1;
unsigned u;
if (skip >= 4) {
u = fl_utf8decode(from, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
u = fl_utf8decode(from + skip, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag
return from + 2 * skip;
}
} else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿)
const char *next = from + skip;
do {
u = fl_utf8decode(next, end, NULL);
next += fl_utf8len1(*next);
if (u == 0xE007F) return next; // ends with "cancel tag"
} while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components"
}
} else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿)
const char *next = from + skip;
do {
u = fl_utf8decode(next, end, NULL);
next += fl_utf8len1(*next);
if (u == 0xE007F) return next; // ends with "cancel tag"
} while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components"
}
from += skip; // skip 1st codepoint
while (from < end) {
@ -1674,6 +1678,8 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) {
from += fl_utf8len(*from); // skip variation selector
} else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK
from += fl_utf8len(*from); // skip modifier
} else if (u == 0x20E3) { // combining enclosing keycap (e.g., 9⃣*️⃣#⃣9⃣)
from += fl_utf8len(*from); // skip it
} else break;
}
return from;
@ -1681,12 +1687,12 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) {
/**
Returns pointer to beginning of character before given location in UTF8 string accounting for composed characters.
See fl_utf8_next_composed_char() for a hint about what is a composed unicode character.
Returns pointer to beginning of character before given location in UTF8 string accounting for emoji sequences.
See fl_utf8_next_composed_char() for an explanation of what an emoji sequence is.
\param from points to a location within a UTF8 string. If this location is inside the UTF8
encoding of a codepoint or is an invalid byte, this function returns \p from - 1.
\param begin points to start of first codepoint of the string.
\return pointer to beginning of first possibly composed character before the codepoint that begins at \p from.
\return pointer to the beginning of the first character, possibly an emoji sequence, before the codepoint that begins at \p from.
*/
const char *fl_utf8_previous_composed_char(const char *from, const char *begin) {
int l = fl_utf8len(*from);
@ -1715,6 +1721,8 @@ const char *fl_utf8_previous_composed_char(const char *from, const char *begin)
from = fl_utf8back(from - 1, begin, NULL);
} else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK
from = fl_utf8back(from - 1, begin, NULL);
} else if (u == 0x20E3) { // combining enclosing keycap
from = fl_utf8back(from - 1, begin, NULL);
} else if (from > begin) {
keep = fl_utf8back(from - 1, begin, NULL);
u = fl_utf8decode(keep, from, NULL);