From: Steve Bennett Date: Sun, 4 Sep 2016 01:04:30 +0000 (+1000) Subject: Extend utf-8 support to 4 byte characters X-Git-Url: https://git.lizzy.rs/?a=commitdiff_plain;h=bda17556464bf5b40870f7d4946ac9bb85575187;p=linenoise.git Extend utf-8 support to 4 byte characters Up to \u1fffff Signed-off-by: Steve Bennett --- diff --git a/utf8.c b/utf8.c index 26924b4..db8f7c1 100644 --- a/utf8.c +++ b/utf8.c @@ -1,7 +1,7 @@ /** * UTF-8 utility functions * - * (c) 2010 Steve Bennett + * (c) 2010-2016 Steve Bennett * * See LICENCE for licence details. */ @@ -13,7 +13,7 @@ #include "utf8.h" #ifdef USE_UTF8 -int utf8_fromunicode(char *p, unsigned short uc) +int utf8_fromunicode(char *p, unsigned uc) { if (uc <= 0x7f) { *p = uc; @@ -24,12 +24,20 @@ int utf8_fromunicode(char *p, unsigned short uc) *p = 0x80 | (uc & 0x3f); return 2; } - else { + else if (uc <= 0xffff) { *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3; } + /* Note: We silently truncate to 21 bits here: 0x1fffff */ + else { + *p++ = 0xf0 | ((uc & 0x1c0000) >> 18); + *p++ = 0x80 | ((uc & 0x3f000) >> 12); + *p++ = 0x80 | ((uc & 0xfc0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 4; + } } int utf8_charlen(int c) @@ -76,16 +84,6 @@ int utf8_index(const char *str, int index) return s - str; } -int utf8_charequal(const char *s1, const char *s2) -{ - int c1, c2; - - utf8_tounicode(s1, &c1); - utf8_tounicode(s2, &c2); - - return c1 == c2; -} - int utf8_tounicode(const char *str, int *uc) { unsigned const char *s = (unsigned const char *)str; @@ -106,6 +104,12 @@ int utf8_tounicode(const char *str, int *uc) return 3; } } + else if (s[0] < 0xf8) { + if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) { + *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80); + return 4; + } + } /* Invalid sequence, so just return the byte */ *uc = *s; diff --git a/utf8.h b/utf8.h index 9537939..5aee96b 100644 
--- a/utf8.h +++ b/utf8.h @@ -1,9 +1,14 @@ #ifndef UTF8_UTIL_H #define UTF8_UTIL_H + +#ifdef __cplusplus +extern "C" { +#endif + /** * UTF-8 utility functions * - * (c) 2010 Steve Bennett + * (c) 2010-2016 Steve Bennett * * See LICENCE for licence details. */ @@ -19,16 +24,16 @@ #else /** - * Converts the given unicode codepoint (0 - 0xffff) to utf-8 + * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8 * and stores the result at 'p'. - * - * Returns the number of utf-8 characters (1-3). + * + * Returns the number of bytes stored at 'p' (1-4). */ -int utf8_fromunicode(char *p, unsigned short uc); +int utf8_fromunicode(char *p, unsigned uc); /** * Returns the length of the utf-8 sequence starting with 'c'. - * + * * Returns 1-4, or -1 if this is not a valid start byte. * * Note that charlen=4 is not supported by the rest of the API. @@ -36,7 +41,7 @@ int utf8_fromunicode(char *p, unsigned short uc); int utf8_charlen(int c); /** - * Returns the number of characters in the utf-8 + * Returns the number of characters in the utf-8 * string of the given byte length. * * Any bytes which are not part of an valid utf-8 @@ -44,13 +49,13 @@ int utf8_charlen(int c); * * The string *must* be null terminated. * - * Does not support unicode code points > \uffff + * Does not support unicode code points > \u1fffff */ int utf8_strlen(const char *str, int bytelen); /** * Returns the byte index of the given character in the utf-8 string. - * + * * The string *must* be null terminated. * * This will return the byte length of a utf-8 string @@ -61,7 +66,7 @@ int utf8_index(const char *str, int charindex); /** * Returns the unicode codepoint corresponding to the * utf-8 sequence 'str'. - * + * * Stores the result in *uc and returns the number of bytes * consumed. * @@ -70,10 +75,14 @@ int utf8_index(const char *str, int charindex); * * If it is not null terminated, the length *must* be checked first. 
* - * Does not support unicode code points > \uffff + * Does not support unicode code points > \u1fffff */ int utf8_tounicode(const char *str, int *uc); #endif +#ifdef __cplusplus +} +#endif + #endif