/**
* UTF-8 utility functions
*
- * (c) 2010 Steve Bennett <steveb@workware.net.au>
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENCE for licence details.
*/
#include "utf8.h"
#ifdef USE_UTF8
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
{
if (uc <= 0x7f) {
*p = uc;
*p = 0x80 | (uc & 0x3f);
return 2;
}
- else {
+ else if (uc <= 0xffff) {
*p++ = 0xe0 | ((uc & 0xf000) >> 12);
*p++ = 0x80 | ((uc & 0xfc0) >> 6);
*p = 0x80 | (uc & 0x3f);
return 3;
}
+ /* Note: We silently truncate to 21 bits here: 0x1fffff */
+ else {
+ *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+ *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+ *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+ *p = 0x80 | (uc & 0x3f);
+ return 4;
+ }
}
int utf8_charlen(int c)
return s - str;
}
-int utf8_charequal(const char *s1, const char *s2)
-{
- int c1, c2;
-
- utf8_tounicode(s1, &c1);
- utf8_tounicode(s2, &c2);
-
- return c1 == c2;
-}
-
int utf8_tounicode(const char *str, int *uc)
{
unsigned const char *s = (unsigned const char *)str;
return 3;
}
}
+ else if (s[0] < 0xf8) {
+ if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+ *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+ return 4;
+ }
+ }
/* Invalid sequence, so just return the byte */
*uc = *s;
#ifndef UTF8_UTIL_H
#define UTF8_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/**
* UTF-8 utility functions
*
- * (c) 2010 Steve Bennett <steveb@workware.net.au>
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENCE for licence details.
*/
#else
/**
- * Converts the given unicode codepoint (0 - 0xffff) to utf-8
+ * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
* and stores the result at 'p'.
- *
- * Returns the number of utf-8 characters (1-3).
+ *
+ * Returns the number of utf-8 bytes written to 'p' (1-4).
*/
-int utf8_fromunicode(char *p, unsigned short uc);
+int utf8_fromunicode(char *p, unsigned uc);
/**
* Returns the length of the utf-8 sequence starting with 'c'.
- *
+ *
* Returns 1-4, or -1 if this is not a valid start byte.
*
* Note that charlen=4 is not supported by the rest of the API.
int utf8_charlen(int c);
/**
- * Returns the number of characters in the utf-8
+ * Returns the number of characters in the utf-8
* string of the given byte length.
*
* Any bytes which are not part of an valid utf-8
*
* The string *must* be null terminated.
*
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > 0x1fffff
*/
int utf8_strlen(const char *str, int bytelen);
/**
* Returns the byte index of the given character in the utf-8 string.
- *
+ *
* The string *must* be null terminated.
*
* This will return the byte length of a utf-8 string
/**
* Returns the unicode codepoint corresponding to the
* utf-8 sequence 'str'.
- *
+ *
* Stores the result in *uc and returns the number of bytes
* consumed.
*
*
* If it is not null terminated, the length *must* be checked first.
*
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > 0x1fffff
*/
int utf8_tounicode(const char *str, int *uc);
#endif
+#ifdef __cplusplus
+}
+#endif
+
#endif