library/core/src/unicode/printable.py

   1 #!/usr/bin/env python
   2
   3 # This script uses the following Unicode tables:
   4 # - UnicodeData.txt
   5
   6
   7 from collections import namedtuple
   8 import csv
   9 import os
  10 import subprocess
  11
  12 NUM_CODEPOINTS=0x110000
  13
  14 def to_ranges(iter):
  15     current = None
  16     for i in iter:
  17         if current is None or i != current[1] or i in (0x10000, 0x20000):
  18             if current is not None:
  19                 yield tuple(current)
  20             current = [i, i + 1]
  21         else:
  22             current[1] += 1
  23     if current is not None:
  24         yield tuple(current)
  25
  26 def get_escaped(codepoints):
  27     for c in codepoints:
  28         if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
  29             yield c.value
  30
  31 def get_file(f):
  32     try:
  33         return open(os.path.basename(f))
  34     except FileNotFoundError:
  35         subprocess.run(["curl", "-O", f], check=True)
  36         return open(os.path.basename(f))
  37
  38 Codepoint = namedtuple('Codepoint', 'value class_')
  39
  40 def get_codepoints(f):
  41     r = csv.reader(f, delimiter=";")
  42     prev_codepoint = 0
  43     class_first = None
  44     for row in r:
  45         codepoint = int(row[0], 16)
  46         name = row[1]
  47         class_ = row[2]
  48
  49         if class_first is not None:
  50             if not name.endswith("Last>"):
  51                 raise ValueError("Missing Last after First")
  52
  53         for c in range(prev_codepoint + 1, codepoint):
  54             yield Codepoint(c, class_first)
  55
  56         class_first = None
  57         if name.endswith("First>"):
  58             class_first = class_
  59
  60         yield Codepoint(codepoint, class_)
  61         prev_codepoint = codepoint
  62
  63     if class_first is not None:
  64         raise ValueError("Missing Last after First")
  65
  66     for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
  67         yield Codepoint(c, None)
  68
  69 def compress_singletons(singletons):
  70     uppers = [] # (upper, # items in lowers)
  71     lowers = []
  72
  73     for i in singletons:
  74         upper = i >> 8
  75         lower = i & 0xff
  76         if len(uppers) == 0 or uppers[-1][0] != upper:
  77             uppers.append((upper, 1))
  78         else:
  79             upper, count = uppers[-1]
  80             uppers[-1] = upper, count + 1
  81         lowers.append(lower)
  82
  83     return uppers, lowers
  84
  85 def compress_normal(normal):
  86     # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
  87     # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
  88     compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
  89
  90     prev_start = 0
  91     for start, count in normal:
  92         truelen = start - prev_start
  93         falselen = count
  94         prev_start = start + count
  95
  96         assert truelen < 0x8000 and falselen < 0x8000
  97         entry = []
  98         if truelen > 0x7f:
  99             entry.append(0x80 | (truelen >> 8))
 100             entry.append(truelen & 0xff)
 101         else:
 102             entry.append(truelen & 0x7f)
 103         if falselen > 0x7f:
 104             entry.append(0x80 | (falselen >> 8))
 105             entry.append(falselen & 0xff)
 106         else:
 107             entry.append(falselen & 0x7f)
 108
 109         compressed.append(entry)
 110
 111     return compressed
 112
 113 def print_singletons(uppers, lowers, uppersname, lowersname):
 114     print("#[rustfmt::skip]")
 115     print("const {}: &[(u8, u8)] = &[".format(uppersname))
 116     for u, c in uppers:
 117         print("    ({:#04x}, {}),".format(u, c))
 118     print("];")
 119     print("#[rustfmt::skip]")
 120     print("const {}: &[u8] = &[".format(lowersname))
 121     for i in range(0, len(lowers), 8):
 122         print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
 123     print("];")
 124
 125 def print_normal(normal, normalname):
 126     print("#[rustfmt::skip]")
 127     print("const {}: &[u8] = &[".format(normalname))
 128     for v in normal:
 129         print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
 130     print("];")
 131
 132 def main():
 133     file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 134
 135     codepoints = get_codepoints(file)
 136
 137     CUTOFF=0x10000
 138     singletons0 = []
 139     singletons1 = []
 140     normal0 = []
 141     normal1 = []
 142     extra = []
 143
 144     for a, b in to_ranges(get_escaped(codepoints)):
 145         if a > 2 * CUTOFF:
 146             extra.append((a, b - a))
 147         elif a == b - 1:
 148             if a & CUTOFF:
 149                 singletons1.append(a & ~CUTOFF)
 150             else:
 151                 singletons0.append(a)
 152         elif a == b - 2:
 153             if a & CUTOFF:
 154                 singletons1.append(a & ~CUTOFF)
 155                 singletons1.append((a + 1) & ~CUTOFF)
 156             else:
 157                 singletons0.append(a)
 158                 singletons0.append(a + 1)
 159         else:
 160             if a >= 2 * CUTOFF:
 161                 extra.append((a, b - a))
 162             elif a & CUTOFF:
 163                 normal1.append((a & ~CUTOFF, b - a))
 164             else:
 165                 normal0.append((a, b - a))
 166
 167     singletons0u, singletons0l = compress_singletons(singletons0)
 168     singletons1u, singletons1l = compress_singletons(singletons1)
 169     normal0 = compress_normal(normal0)
 170     normal1 = compress_normal(normal1)
 171
 172     print("""\
 173 // NOTE: The following code was generated by "library/core/src/unicode/printable.py",
 174 //       do not edit directly!
 175
 176 fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
 177     let xupper = (x >> 8) as u8;
 178     let mut lowerstart = 0;
 179     for &(upper, lowercount) in singletonuppers {
 180         let lowerend = lowerstart + lowercount as usize;
 181         if xupper == upper {
 182             for &lower in &singletonlowers[lowerstart..lowerend] {
 183                 if lower == x as u8 {
 184                     return false;
 185                 }
 186             }
 187         } else if xupper < upper {
 188             break;
 189         }
 190         lowerstart = lowerend;
 191     }
 192
 193     let mut x = x as i32;
 194     let mut normal = normal.iter().cloned();
 195     let mut current = true;
 196     while let Some(v) = normal.next() {
 197         let len = if v & 0x80 != 0 {
 198             ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
 199         } else {
 200             v as i32
 201         };
 202         x -= len;
 203         if x < 0 {
 204             break;
 205         }
 206         current = !current;
 207     }
 208     current
 209 }
 210
 211 pub(crate) fn is_printable(x: char) -> bool {
 212     let x = x as u32;
 213     let lower = x as u16;
 214
 215     if x < 32 {
 216         // ASCII fast path
 217         false
 218     } else if x < 127 {
 219         // ASCII fast path
 220         true
 221     } else if x < 0x10000 {
 222         check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
 223     } else if x < 0x20000 {
 224         check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
 225     } else {\
 226 """)
 227     for a, b in extra:
 228         print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
 229         print("            return false;")
 230         print("        }")
 231     print("""\
 232         true
 233     }
 234 }\
 235 """)
 236     print()
 237     print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
 238     print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
 239     print_normal(normal0, 'NORMAL0')
 240     print_normal(normal1, 'NORMAL1')
 241
 242 if __name__ == '__main__':
 243     main()