Extend utf-8 support to 4 byte characters

author Steve Bennett <steveb@workware.net.au>

Sun, 4 Sep 2016 01:04:30 +0000 (11:04 +1000)

committer Steve Bennett <steveb@workware.net.au>

Sun, 4 Sep 2016 01:27:54 +0000 (11:27 +1000)
author Steve Bennett <steveb@workware.net.au>
Sun, 4 Sep 2016 01:04:30 +0000 (11:04 +1000)
committer Steve Bennett <steveb@workware.net.au>
Sun, 4 Sep 2016 01:27:54 +0000 (11:27 +1000)
diff --git a/utf8.c b/utf8.c

index 26924b46c19154755c904038c949a98b033e3c50..db8f7c11f6dfafdbf0dccb72583606fbc9ca96b6 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1,7 +1,7 @@
  /**
   * UTF-8 utility functions
   *
- * (c) 2010 Steve Bennett <steveb@workware.net.au>
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
   *
   * See LICENCE for licence details.
   */
@@ -13,7 +13,7 @@
  #include "utf8.h"
  
  #ifdef USE_UTF8
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
  {
      if (uc <= 0x7f) {
          *p = uc;
@@ -24,12 +24,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
          *p = 0x80 | (uc & 0x3f);
          return 2;
      }
-    else {
+    else if (uc <= 0xffff) {
          *p++ = 0xe0 | ((uc & 0xf000) >> 12);
          *p++ = 0x80 | ((uc & 0xfc0) >> 6);
          *p = 0x80 | (uc & 0x3f);
          return 3;
      }
+    /* Note: We silently truncate to 21 bits here: 0x1fffff */
+    else {
+        *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+        *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+        *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+        *p = 0x80 | (uc & 0x3f);
+        return 4;
+    }
  }
  
  int utf8_charlen(int c)
@@ -76,16 +84,6 @@ int utf8_index(const char *str, int index)
      return s - str;
  }
  
-int utf8_charequal(const char *s1, const char *s2)
-{
-    int c1, c2;
-
-    utf8_tounicode(s1, &c1);
-    utf8_tounicode(s2, &c2);
-
-    return c1 == c2;
-}
-
  int utf8_tounicode(const char *str, int *uc)
  {
      unsigned const char *s = (unsigned const char *)str;
@@ -106,6 +104,12 @@ int utf8_tounicode(const char *str, int *uc)
              return 3;
          }
      }
+    else if (s[0] < 0xf8) {
+        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+            return 4;
+        }
+    }
  
      /* Invalid sequence, so just return the byte */
      *uc = *s;
diff --git a/utf8.h b/utf8.h

index 9537939876ae091451500330b488485f740361f7..5aee96b2a296d4be8bf80a638448a87e95d0d438 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -1,9 +1,14 @@
  #ifndef UTF8_UTIL_H
  #define UTF8_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
  /**
   * UTF-8 utility functions
   *
- * (c) 2010 Steve Bennett <steveb@workware.net.au>
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
   *
   * See LICENCE for licence details.
   */
@@ -19,16 +24,16 @@
  
  #else
  /**
- * Converts the given unicode codepoint (0 - 0xffff) to utf-8
+ * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
   * and stores the result at 'p'.
- * 
- * Returns the number of utf-8 characters (1-3).
+ *
+ * Returns the number of utf-8 bytes written to 'p' (1-4).
   */
-int utf8_fromunicode(char *p, unsigned short uc);
+int utf8_fromunicode(char *p, unsigned uc);
  
  /**
   * Returns the length of the utf-8 sequence starting with 'c'.
- * 
+ *
   * Returns 1-4, or -1 if this is not a valid start byte.
   *
   * Note that charlen=4 is not supported by the rest of the API.
@@ -36,7 +41,7 @@ int utf8_fromunicode(char *p, unsigned short uc);
  int utf8_charlen(int c);
  
  /**
- * Returns the number of characters in the utf-8 
+ * Returns the number of characters in the utf-8
   * string of the given byte length.
   *
   * Any bytes which are not part of an valid utf-8
@@ -44,13 +49,13 @@ int utf8_charlen(int c);
   *
   * The string *must* be null terminated.
   *
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > 0x1fffff
   */
  int utf8_strlen(const char *str, int bytelen);
  
  /**
   * Returns the byte index of the given character in the utf-8 string.
- * 
+ *
   * The string *must* be null terminated.
   *
   * This will return the byte length of a utf-8 string
@@ -61,7 +66,7 @@ int utf8_index(const char *str, int charindex);
  /**
   * Returns the unicode codepoint corresponding to the
   * utf-8 sequence 'str'.
- * 
+ *
   * Stores the result in *uc and returns the number of bytes
   * consumed.
   *
@@ -70,10 +75,14 @@ int utf8_index(const char *str, int charindex);
   *
   * If it is not null terminated, the length *must* be checked first.
   *
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > 0x1fffff
   */
  int utf8_tounicode(const char *str, int *uc);
  
  #endif
  
+#ifdef __cplusplus
+}
+#endif
+
  #endif
author	Steve Bennett <steveb@workware.net.au>
	Sun, 4 Sep 2016 01:04:30 +0000 (11:04 +1000)
committer	Steve Bennett <steveb@workware.net.au>
	Sun, 4 Sep 2016 01:27:54 +0000 (11:27 +1000)