3 # This script uses the following Unicode tables:
7 from collections import namedtuple
12 NUM_CODEPOINTS=0x110000
17 if current is None or i != current[1] or i in (0x10000, 0x20000):
18 if current is not None:
23 if current is not None:
26 def get_escaped(codepoints):
28 if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
33 return open(os.path.basename(f))
34 except FileNotFoundError:
35 subprocess.run(["curl", "-O", f], check=True)
36 return open(os.path.basename(f))
38 Codepoint = namedtuple('Codepoint', 'value class_')
40 def get_codepoints(f):
41 r = csv.reader(f, delimiter=";")
45 codepoint = int(row[0], 16)
49 if class_first is not None:
50 if not name.endswith("Last>"):
51 raise ValueError("Missing Last after First")
53 for c in range(prev_codepoint + 1, codepoint):
54 yield Codepoint(c, class_first)
57 if name.endswith("First>"):
60 yield Codepoint(codepoint, class_)
61 prev_codepoint = codepoint
63 if class_first is not None:
64 raise ValueError("Missing Last after First")
66 for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
67 yield Codepoint(c, None)
69 def compress_singletons(singletons):
70 uppers = [] # (upper, # items in lowers)
76 if len(uppers) == 0 or uppers[-1][0] != upper:
77 uppers.append((upper, 1))
79 upper, count = uppers[-1]
80 uppers[-1] = upper, count + 1
85 def compress_normal(normal):
86 # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
87 # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
88 compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
91 for start, count in normal:
92 truelen = start - prev_start
94 prev_start = start + count
96 assert truelen < 0x8000 and falselen < 0x8000
99 entry.append(0x80 | (truelen >> 8))
100 entry.append(truelen & 0xff)
102 entry.append(truelen & 0x7f)
104 entry.append(0x80 | (falselen >> 8))
105 entry.append(falselen & 0xff)
107 entry.append(falselen & 0x7f)
109 compressed.append(entry)
113 def print_singletons(uppers, lowers, uppersname, lowersname):
114 print("#[rustfmt::skip]")
115 print("const {}: &[(u8, u8)] = &[".format(uppersname))
117 print(" ({:#04x}, {}),".format(u, c))
119 print("#[rustfmt::skip]")
120 print("const {}: &[u8] = &[".format(lowersname))
121 for i in range(0, len(lowers), 8):
122 print(" {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
125 def print_normal(normal, normalname):
126 print("#[rustfmt::skip]")
127 print("const {}: &[u8] = &[".format(normalname))
129 print(" {}".format(" ".join("{:#04x},".format(i) for i in v)))
133 file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
135 codepoints = get_codepoints(file)
144 for a, b in to_ranges(get_escaped(codepoints)):
146 extra.append((a, b - a))
149 singletons1.append(a & ~CUTOFF)
151 singletons0.append(a)
154 singletons1.append(a & ~CUTOFF)
155 singletons1.append((a + 1) & ~CUTOFF)
157 singletons0.append(a)
158 singletons0.append(a + 1)
161 extra.append((a, b - a))
163 normal1.append((a & ~CUTOFF, b - a))
165 normal0.append((a, b - a))
167 singletons0u, singletons0l = compress_singletons(singletons0)
168 singletons1u, singletons1l = compress_singletons(singletons1)
169 normal0 = compress_normal(normal0)
170 normal1 = compress_normal(normal1)
173 // NOTE: The following code was generated by "library/core/src/unicode/printable.py",
174 // do not edit directly!
176 fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
177 let xupper = (x >> 8) as u8;
178 let mut lowerstart = 0;
179 for &(upper, lowercount) in singletonuppers {
180 let lowerend = lowerstart + lowercount as usize;
182 for &lower in &singletonlowers[lowerstart..lowerend] {
183 if lower == x as u8 {
187 } else if xupper < upper {
190 lowerstart = lowerend;
193 let mut x = x as i32;
194 let mut normal = normal.iter().cloned();
195 let mut current = true;
196 while let Some(v) = normal.next() {
197 let len = if v & 0x80 != 0 {
198 ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
211 pub(crate) fn is_printable(x: char) -> bool {
213 let lower = x as u16;
221 } else if x < 0x10000 {
222 check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
223 } else if x < 0x20000 {
224 check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
228 print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
229 print(" return false;")
237 print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
238 print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
239 print_normal(normal0, 'NORMAL0')
240 print_normal(normal1, 'NORMAL1')
242 if __name__ == '__main__':