]> git.lizzy.rs Git - rust.git/commitdiff
Fix `fmt::Debug` for strings, e.g. for Chinese characters
author Tobias Bucher <tobiasbucher5991@gmail.com>
Fri, 18 Nov 2016 12:59:44 +0000 (13:59 +0100)
committer Tobias Bucher <tobiasbucher5991@gmail.com>
Fri, 18 Nov 2016 13:45:59 +0000 (14:45 +0100)
The problem occurred due to lines like

```
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
```

in `UnicodeData.txt`, which the script previously interpreted as two
individual characters, although together they mark the first and last
code points of a whole range.

Fixes #34318.

src/etc/char_private.py
src/libcore/char_private.rs
src/libcoretest/char.rs

index 3566d143529be9135a6ce18f80e3b7ba8bb08fe2..9d15f98e06709eaadf5d61d4ef235bc1e65f1b0c 100644 (file)
 # except according to those terms.
 
 # This script uses the following Unicode tables:
-# - Categories.txt
+# - UnicodeData.txt
 
+
+from collections import namedtuple
+import csv
 import os
 import subprocess
 
+NUM_CODEPOINTS=0x110000
+
 def to_ranges(iter):
     current = None
     for i in iter:
@@ -28,10 +33,10 @@ def to_ranges(iter):
     if current is not None:
         yield tuple(current)
 
-def get_escaped(dictionary):
-    for i in range(0x110000):
-        if dictionary.get(i, "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and i != ord(' '):
-            yield i
+def get_escaped(codepoints):
+    for c in codepoints:
+        if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
+            yield c.value
 
 def get_file(f):
     try:
@@ -40,10 +45,41 @@ def get_file(f):
         subprocess.run(["curl", "-O", f], check=True)
         return open(os.path.basename(f))
 
+Codepoint = namedtuple('Codepoint', 'value class_')
+
+def get_codepoints(f):
+    r = csv.reader(f, delimiter=";")
+    prev_codepoint = 0
+    class_first = None
+    for row in r:
+        codepoint = int(row[0], 16)
+        name = row[1]
+        class_ = row[2]
+
+        if class_first is not None:
+            if not name.endswith("Last>"):
+                raise ValueError("Missing Last after First")
+
+        for c in range(prev_codepoint + 1, codepoint):
+            yield Codepoint(c, class_first)
+
+        class_first = None
+        if name.endswith("First>"):
+            class_first = class_
+
+        yield Codepoint(codepoint, class_)
+        prev_codepoint = codepoint
+
+    if class_first != None:
+        raise ValueError("Missing Last after First")
+
+    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
+        yield Codepoint(c, None)
+
 def main():
-    file = get_file("http://www.unicode.org/notes/tn36/Categories.txt")
+    file = get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 
-    dictionary = {int(line.split()[0], 16): line.split()[1] for line in file}
+    codepoints = get_codepoints(file)
 
     CUTOFF=0x10000
     singletons0 = []
@@ -52,7 +88,7 @@ def main():
     normal1 = []
     extra = []
 
-    for a, b in to_ranges(get_escaped(dictionary)):
+    for a, b in to_ranges(get_escaped(codepoints)):
         if a > 2 * CUTOFF:
             extra.append((a, b - a))
         elif a == b - 1:
index 708e7cc15e7c97f97f89f9f4c326bc02bf2acd97..ddc473592a26026035edd0c0f15705e30f0a3dd0 100644 (file)
@@ -11,6 +11,8 @@
 // NOTE: The following code was generated by "src/etc/char_private.py",
 //       do not edit directly!
 
+use slice::SliceExt;
+
 fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool {
     for &s in singletons {
         if x == s {
@@ -42,7 +44,16 @@ pub fn is_printable(x: char) -> bool {
     } else if x < 0x20000 {
         check(lower, SINGLETONS1, NORMAL1)
     } else {
-        if 0x20000 <= x && x < 0x2f800 {
+        if 0x2a6d7 <= x && x < 0x2a700 {
+            return false;
+        }
+        if 0x2b735 <= x && x < 0x2b740 {
+            return false;
+        }
+        if 0x2b81e <= x && x < 0x2b820 {
+            return false;
+        }
+        if 0x2cea2 <= x && x < 0x2f800 {
             return false;
         }
         if 0x2fa1e <= x && x < 0xe0100 {
@@ -62,10 +73,13 @@ pub fn is_printable(x: char) -> bool {
     0x38b,
     0x38d,
     0x3a2,
+    0x530,
     0x557,
     0x558,
     0x560,
     0x588,
+    0x58b,
+    0x58c,
     0x590,
     0x61c,
     0x61d,
@@ -79,10 +93,8 @@ pub fn is_printable(x: char) -> bool {
     0x83f,
     0x85c,
     0x85d,
-    0x8a1,
-    0x8ff,
-    0x978,
-    0x980,
+    0x8b5,
+    0x8e2,
     0x984,
     0x98d,
     0x98e,
@@ -154,14 +166,11 @@ pub fn is_printable(x: char) -> bool {
     0xc0d,
     0xc11,
     0xc29,
-    0xc34,
     0xc45,
     0xc49,
     0xc57,
     0xc64,
     0xc65,
-    0xc80,
-    0xc81,
     0xc84,
     0xc8d,
     0xc91,
@@ -193,6 +202,8 @@ pub fn is_printable(x: char) -> bool {
     0xdbf,
     0xdd5,
     0xdd7,
+    0xdf0,
+    0xdf1,
     0xe83,
     0xe85,
     0xe86,
@@ -245,6 +256,10 @@ pub fn is_printable(x: char) -> bool {
     0x1317,
     0x135b,
     0x135c,
+    0x13f6,
+    0x13f7,
+    0x13fe,
+    0x13ff,
     0x1680,
     0x170d,
     0x176d,
@@ -253,6 +268,7 @@ pub fn is_printable(x: char) -> bool {
     0x17df,
     0x180e,
     0x180f,
+    0x191f,
     0x196e,
     0x196f,
     0x1a1c,
@@ -260,6 +276,9 @@ pub fn is_printable(x: char) -> bool {
     0x1a5f,
     0x1a7d,
     0x1a7e,
+    0x1aae,
+    0x1aaf,
+    0x1cf7,
     0x1f16,
     0x1f17,
     0x1f1e,
@@ -285,7 +304,12 @@ pub fn is_printable(x: char) -> bool {
     0x2072,
     0x2073,
     0x208f,
-    0x2700,
+    0x23ff,
+    0x2b74,
+    0x2b75,
+    0x2b96,
+    0x2b97,
+    0x2bc9,
     0x2c2f,
     0x2c5f,
     0x2d26,
@@ -306,8 +330,11 @@ pub fn is_printable(x: char) -> bool {
     0x318f,
     0x321f,
     0x32ff,
-    0xa78f,
+    0xa7af,
+    0xa8fe,
+    0xa8ff,
     0xa9ce,
+    0xa9ff,
     0xaa4e,
     0xaa4f,
     0xaa5a,
@@ -317,6 +344,7 @@ pub fn is_printable(x: char) -> bool {
     0xab0f,
     0xab10,
     0xab27,
+    0xab2f,
     0xabee,
     0xabef,
     0xfa6e,
@@ -350,7 +378,7 @@ pub fn is_printable(x: char) -> bool {
     0x3e,
     0x4e,
     0x4f,
-    0x31f,
+    0x18f,
     0x39e,
     0x49e,
     0x49f,
@@ -361,6 +389,9 @@ pub fn is_printable(x: char) -> bool {
     0x83d,
     0x83e,
     0x856,
+    0x8f3,
+    0x9d0,
+    0x9d1,
     0xa04,
     0xa14,
     0xa18,
@@ -368,6 +399,49 @@ pub fn is_printable(x: char) -> bool {
     0xb57,
     0x10bd,
     0x1135,
+    0x11ce,
+    0x11cf,
+    0x11e0,
+    0x1212,
+    0x1287,
+    0x1289,
+    0x128e,
+    0x129e,
+    0x1304,
+    0x130d,
+    0x130e,
+    0x1311,
+    0x1312,
+    0x1329,
+    0x1331,
+    0x1334,
+    0x133a,
+    0x133b,
+    0x1345,
+    0x1346,
+    0x1349,
+    0x134a,
+    0x134e,
+    0x134f,
+    0x1364,
+    0x1365,
+    0x145a,
+    0x145c,
+    0x15b6,
+    0x15b7,
+    0x1c09,
+    0x1c37,
+    0x1c90,
+    0x1c91,
+    0x1ca8,
+    0x246f,
+    0x6a5f,
+    0x6aee,
+    0x6aef,
+    0x6b5a,
+    0x6b62,
+    0xbc9a,
+    0xbc9b,
     0xd127,
     0xd128,
     0xd455,
@@ -395,6 +469,14 @@ pub fn is_printable(x: char) -> bool {
     0xd6a7,
     0xd7cc,
     0xd7cd,
+    0xdaa0,
+    0xe007,
+    0xe019,
+    0xe01a,
+    0xe022,
+    0xe025,
+    0xe8c5,
+    0xe8c6,
     0xee04,
     0xee20,
     0xee23,
@@ -429,31 +511,25 @@ pub fn is_printable(x: char) -> bool {
     0xeeaa,
     0xf0af,
     0xf0b0,
-    0xf0bf,
     0xf0c0,
     0xf0d0,
     0xf12f,
-    0xf336,
-    0xf3c5,
-    0xf43f,
-    0xf441,
-    0xf4f8,
-    0xf53e,
-    0xf53f,
+    0xf91f,
+    0xf931,
+    0xf932,
+    0xf93f,
 ];
 const NORMAL0: &'static [u16] = &[
     0x0, 0x20,
     0x7f, 0x22,
-    0x37f, 0x5,
-    0x528, 0x9,
-    0x58b, 0x4,
+    0x380, 0x4,
     0x5c8, 0x8,
     0x5eb, 0x5,
     0x5f5, 0x11,
     0x7b2, 0xe,
     0x7fb, 0x5,
     0x85f, 0x41,
-    0x8ad, 0x37,
+    0x8be, 0x16,
     0x9b3, 0x3,
     0x9cf, 0x8,
     0x9d8, 0x4,
@@ -465,7 +541,8 @@ pub fn is_printable(x: char) -> bool {
     0xa5f, 0x7,
     0xa76, 0xb,
     0xad1, 0xf,
-    0xaf2, 0xf,
+    0xaf2, 0x7,
+    0xafa, 0x7,
     0xb4e, 0x8,
     0xb58, 0x4,
     0xb78, 0xa,
@@ -478,21 +555,19 @@ pub fn is_printable(x: char) -> bool {
     0xbc3, 0x3,
     0xbd1, 0x6,
     0xbd8, 0xe,
-    0xbfb, 0x6,
+    0xbfb, 0x5,
     0xc3a, 0x3,
     0xc4e, 0x7,
-    0xc5a, 0x6,
+    0xc5b, 0x5,
     0xc70, 0x8,
     0xcce, 0x7,
     0xcd7, 0x7,
-    0xcf3, 0xf,
-    0xd4f, 0x8,
-    0xd58, 0x8,
-    0xd76, 0x3,
+    0xcf3, 0xe,
+    0xd50, 0x4,
     0xd97, 0x3,
     0xdc7, 0x3,
     0xdcb, 0x4,
-    0xde0, 0x12,
+    0xde0, 0x6,
     0xdf5, 0xc,
     0xe3b, 0x4,
     0xe5c, 0x25,
@@ -503,9 +578,8 @@ pub fn is_printable(x: char) -> bool {
     0x10c8, 0x5,
     0x137d, 0x3,
     0x139a, 0x6,
-    0x13f5, 0xb,
     0x169d, 0x3,
-    0x16f1, 0xf,
+    0x16f9, 0x7,
     0x1715, 0xb,
     0x1737, 0x9,
     0x1754, 0xc,
@@ -516,7 +590,6 @@ pub fn is_printable(x: char) -> bool {
     0x1878, 0x8,
     0x18ab, 0x5,
     0x18f6, 0xa,
-    0x191d, 0x3,
     0x192c, 0x4,
     0x193c, 0x4,
     0x1941, 0x3,
@@ -526,34 +599,34 @@ pub fn is_printable(x: char) -> bool {
     0x19db, 0x3,
     0x1a8a, 0x6,
     0x1a9a, 0x6,
-    0x1aae, 0x52,
+    0x1abf, 0x41,
     0x1b4c, 0x4,
     0x1b7d, 0x3,
     0x1bf4, 0x8,
     0x1c38, 0x3,
     0x1c4a, 0x3,
-    0x1c80, 0x40,
+    0x1c89, 0x37,
     0x1cc8, 0x8,
-    0x1cf7, 0x9,
-    0x1de7, 0x15,
+    0x1cfa, 0x6,
+    0x1df6, 0x5,
     0x1fff, 0x11,
     0x2028, 0x8,
     0x205f, 0x11,
     0x209d, 0x3,
-    0x20ba, 0x16,
+    0x20bf, 0x11,
     0x20f1, 0xf,
-    0x218a, 0x6,
-    0x23f4, 0xc,
+    0x218c, 0x4,
     0x2427, 0x19,
     0x244b, 0x15,
-    0x2b4d, 0x3,
-    0x2b5a, 0xa6,
+    0x2bba, 0x3,
+    0x2bd2, 0x1a,
+    0x2bf0, 0x10,
     0x2cf4, 0x5,
     0x2d28, 0x5,
     0x2d68, 0x7,
     0x2d71, 0xe,
     0x2d97, 0x9,
-    0x2e3c, 0x44,
+    0x2e45, 0x3b,
     0x2ef4, 0xc,
     0x2fd6, 0x1a,
     0x2ffc, 0x5,
@@ -561,32 +634,28 @@ pub fn is_printable(x: char) -> bool {
     0x312e, 0x3,
     0x31bb, 0x5,
     0x31e4, 0xc,
-    0x3400, 0x19c0,
-    0x4e00, 0x5200,
+    0x4db6, 0xa,
+    0x9fd6, 0x2a,
     0xa48d, 0x3,
     0xa4c7, 0x9,
     0xa62c, 0x14,
-    0xa698, 0x7,
     0xa6f8, 0x8,
-    0xa794, 0xc,
-    0xa7ab, 0x4d,
+    0xa7b8, 0x3f,
     0xa82c, 0x4,
     0xa83a, 0x6,
     0xa878, 0x8,
-    0xa8c5, 0x9,
+    0xa8c6, 0x8,
     0xa8da, 0x6,
-    0xa8fc, 0x4,
     0xa954, 0xb,
     0xa97d, 0x3,
     0xa9da, 0x4,
-    0xa9e0, 0x20,
     0xaa37, 0x9,
-    0xaa7c, 0x4,
     0xaac3, 0x18,
     0xaaf7, 0xa,
     0xab17, 0x9,
-    0xab2f, 0x91,
-    0xabfa, 0x2bb6,
+    0xab66, 0xa,
+    0xabfa, 0x6,
+    0xd7a4, 0xc,
     0xd7c7, 0x4,
     0xd7fc, 0x2104,
     0xfada, 0x26,
@@ -596,7 +665,6 @@ pub fn is_printable(x: char) -> bool {
     0xfd40, 0x10,
     0xfdc8, 0x28,
     0xfe1a, 0x6,
-    0xfe27, 0x9,
     0xfe6c, 0x4,
     0xfefd, 0x4,
     0xffbf, 0x3,
@@ -608,61 +676,123 @@ pub fn is_printable(x: char) -> bool {
     0xfb, 0x5,
     0x103, 0x4,
     0x134, 0x3,
-    0x18b, 0x5,
-    0x19c, 0x34,
+    0x19c, 0x4,
+    0x1a1, 0x2f,
     0x1fe, 0x82,
     0x29d, 0x3,
-    0x2d1, 0x2f,
+    0x2d1, 0xf,
+    0x2fc, 0x4,
     0x324, 0xc,
-    0x34b, 0x35,
+    0x34b, 0x5,
+    0x37b, 0x5,
     0x3c4, 0x4,
     0x3d6, 0x2a,
-    0x4aa, 0x356,
+    0x4aa, 0x6,
+    0x4d4, 0x4,
+    0x4fc, 0x4,
+    0x528, 0x8,
+    0x564, 0xb,
+    0x570, 0x90,
+    0x737, 0x9,
+    0x756, 0xa,
+    0x768, 0x98,
     0x839, 0x3,
-    0x860, 0xa0,
+    0x89f, 0x8,
+    0x8b0, 0x30,
+    0x8f6, 0x5,
     0x91c, 0x3,
     0x93a, 0x5,
     0x940, 0x40,
-    0x9b8, 0x6,
-    0x9c0, 0x40,
+    0x9b8, 0x4,
     0xa07, 0x5,
     0xa34, 0x4,
     0xa3b, 0x4,
     0xa48, 0x8,
     0xa59, 0x7,
-    0xa80, 0x80,
+    0xaa0, 0x20,
+    0xae7, 0x4,
+    0xaf7, 0x9,
     0xb36, 0x3,
     0xb73, 0x5,
-    0xb80, 0x80,
-    0xc49, 0x217,
+    0xb92, 0x7,
+    0xb9d, 0xc,
+    0xbb0, 0x50,
+    0xc49, 0x37,
+    0xcb3, 0xd,
+    0xcf3, 0x7,
+    0xd00, 0x160,
     0xe7f, 0x181,
     0x104e, 0x4,
-    0x1070, 0x10,
+    0x1070, 0xf,
     0x10c2, 0xe,
     0x10e9, 0x7,
     0x10fa, 0x6,
-    0x1144, 0x3c,
-    0x11c9, 0x7,
-    0x11da, 0x4a6,
+    0x1144, 0xc,
+    0x1177, 0x9,
+    0x11f5, 0xb,
+    0x123f, 0x41,
+    0x12aa, 0x6,
+    0x12eb, 0x5,
+    0x12fa, 0x6,
+    0x1351, 0x6,
+    0x1358, 0x5,
+    0x136d, 0x3,
+    0x1375, 0x8b,
+    0x145e, 0x22,
+    0x14c8, 0x8,
+    0x14da, 0xa6,
+    0x15de, 0x22,
+    0x1645, 0xb,
+    0x165a, 0x6,
+    0x166d, 0x13,
     0x16b8, 0x8,
-    0x16ca, 0x936,
-    0x236f, 0x91,
-    0x2463, 0xd,
-    0x2474, 0xb8c,
-    0x342f, 0x33d1,
-    0x6a39, 0x4c7,
+    0x16ca, 0x36,
+    0x171a, 0x3,
+    0x172c, 0x4,
+    0x1740, 0x160,
+    0x18f3, 0xc,
+    0x1900, 0x1c0,
+    0x1af9, 0x107,
+    0x1c46, 0xa,
+    0x1c6d, 0x3,
+    0x1cb7, 0x349,
+    0x239a, 0x66,
+    0x2475, 0xb,
+    0x2544, 0xabc,
+    0x342f, 0xfd1,
+    0x4647, 0x21b9,
+    0x6a39, 0x7,
+    0x6a6a, 0x4,
+    0x6a70, 0x60,
+    0x6af6, 0xa,
+    0x6b46, 0xa,
+    0x6b78, 0x5,
+    0x6b90, 0x370,
     0x6f45, 0xb,
     0x6f7f, 0x10,
-    0x6fa0, 0x4060,
-    0xb002, 0x1ffe,
+    0x6fa0, 0x40,
+    0x6fe1, 0x1f,
+    0x87ed, 0x13,
+    0x8af3, 0x250d,
+    0xb002, 0xbfe,
+    0xbc6b, 0x5,
+    0xbc7d, 0x3,
+    0xbc89, 0x7,
+    0xbca0, 0x1360,
     0xd0f6, 0xa,
     0xd173, 0x8,
-    0xd1de, 0x22,
+    0xd1e9, 0x17,
     0xd246, 0xba,
     0xd357, 0x9,
     0xd372, 0x8e,
     0xd547, 0x3,
-    0xd800, 0x1600,
+    0xda8c, 0xf,
+    0xdab0, 0x550,
+    0xe02b, 0x7d5,
+    0xe8d7, 0x29,
+    0xe94b, 0x5,
+    0xe95a, 0x4,
+    0xe960, 0x4a0,
     0xee3c, 0x6,
     0xee43, 0x4,
     0xee9c, 0x5,
@@ -670,24 +800,27 @@ pub fn is_printable(x: char) -> bool {
     0xeef2, 0x10e,
     0xf02c, 0x4,
     0xf094, 0xc,
-    0xf0e0, 0x20,
-    0xf10b, 0x5,
+    0xf0f6, 0xa,
+    0xf10d, 0x3,
     0xf16c, 0x4,
-    0xf19b, 0x4b,
+    0xf1ad, 0x39,
     0xf203, 0xd,
-    0xf23b, 0x5,
+    0xf23c, 0x4,
     0xf249, 0x7,
     0xf252, 0xae,
-    0xf321, 0xf,
-    0xf37d, 0x3,
-    0xf394, 0xc,
-    0xf3cb, 0x15,
-    0xf3f1, 0xf,
-    0xf4fd, 0x3,
-    0xf544, 0xc,
-    0xf568, 0x93,
-    0xf641, 0x4,
-    0xf650, 0x30,
-    0xf6c6, 0x3a,
-    0xf774, 0x88c,
+    0xf6d3, 0xd,
+    0xf6ed, 0x3,
+    0xf6f7, 0x9,
+    0xf774, 0xc,
+    0xf7d5, 0x2b,
+    0xf80c, 0x4,
+    0xf848, 0x8,
+    0xf85a, 0x6,
+    0xf888, 0x8,
+    0xf8ae, 0x62,
+    0xf928, 0x8,
+    0xf94c, 0x4,
+    0xf95f, 0x21,
+    0xf992, 0x2e,
+    0xf9c1, 0x63f,
 ];
index 7da0b6902f2717f5f6a584cd77d2c15eba74ed96..b4088ffbf89a952980558adde182ef1f6b6d8609 100644 (file)
@@ -162,6 +162,8 @@ fn string(c: char) -> String {
     assert_eq!(s, "~");
     let s = string('é');
     assert_eq!(s, "é");
+    let s = string('文');
+    assert_eq!(s, "文");
     let s = string('\x00');
     assert_eq!(s, "\\u{0}");
     let s = string('\x1f');