3 # Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
4 # file at the top-level directory of this distribution and at
5 # http://rust-lang.org/COPYRIGHT.
7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10 # option. This file may not be copied, modified, or distributed
11 # except according to those terms.
13 # This digests UnicodeData.txt, DerivedCoreProperties.txt and PropList.txt and
14 # emits Rust code covering the core properties. Since regenerating is a pretty
15 # rare event, we just store the output out-of-line and check the generated .rs files into git.
17 # The emitted code is "the minimum we think is necessary for libstd", that
18 # is, to support basic operations of the compiler and "most nontrivial Rust
19 # programs". It is not meant to be a complete implementation of Unicode.
20 # For that we recommend you use a proper binding to libicu.
22 import fileinput, re, os, sys, operator
# Make sure the data file `f` is available locally. When it does not exist,
# curl is invoked through os.system to download from the unicode.org UNIDATA
# directory (the URL ends in %s, presumably filled in with `f` -- TODO confirm).
# The existence check is then repeated and a file that is still missing is
# reported on stderr.
# NOTE(review): the "cannot load %s" message has no trailing newline.
26 if not os.path.exists(f):
27 os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
30 if not os.path.exists(f):
31 sys.stderr.write("cannot load %s" % f)
# Parse the semicolon-separated records of file `f` (this script calls it with
# "UnicodeData.txt") and collect:
#   - canon_decomp / compat_decomp: code -> decomposition as a list of ints
#   - gencats: general category -> list of (lo, hi) ranges
#   - combines: list of (lo, hi, combining class); class "0" is not recorded
#   - lowerupper / upperlower: simple one-to-one case mappings
# Returns the tuple
#   (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower).
35 def load_unicode_data(f):
50 for line in fileinput.input(f):
51 fields = line.split(";")
# The unpacking below requires exactly 15 fields per record.
54 [code, name, gencat, combine, bidi,
55 decomp, deci, digit, num, mirror,
56 old, iso, upcase, lowcase, titlecase ] = fields
61 # generate char to char direct common and simple conversions
62 # uppercase to lowercase
63 if gencat == "Lu" and lowcase != "" and code_org != lowcase:
64 upperlower[code] = int(lowcase, 16)
66 # lowercase to uppercase
67 if gencat == "Ll" and upcase != "" and code_org != upcase:
68 lowerupper[code] = int(upcase, 16)
# A decomposition field that starts with '<' has a leading tag token; the tag
# is dropped ([1:]) and the remaining hex code points go to compat_decomp.
71 if decomp.startswith('<'):
73 for i in decomp.split()[1:]:
74 seq.append(int(i, 16))
75 compat_decomp[code] = seq
# Here every token is parsed as a hex code point and the sequence is stored in
# canon_decomp (presumably the branch for untagged fields -- TODO confirm).
78 for i in decomp.split():
79 seq.append(int(i, 16))
80 canon_decomp[code] = seq
# General-category bookkeeping: (c_lo, c_hi) ranges are appended to the list
# for curr_cat; runs of equal categories appear to be merged -- TODO confirm.
87 if curr_cat == gencat:
90 if curr_cat not in gencats:
91 gencats[curr_cat] = []
93 gencats[curr_cat].append((c_lo, c_hi))
# Combining-class bookkeeping: a (com_lo, com_hi, class) entry is appended
# only when the class is not "0".
98 if curr_combine == "":
99 curr_combine = combine
103 if curr_combine == combine:
106 if curr_combine != "0":
107 combines.append((com_lo, com_hi, curr_combine))
108 curr_combine = combine
112 return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
# Read the property file `f` and collect, in `props`, a list of (d_lo, d_hi)
# ranges for each property whose name is listed in `interestingprops`.
#   re1 matches a single code point line:  "XXXX ; Prop"
#   re2 matches a range line:              "XXXX..YYYY ; Prop"
# NOTE(review): both patterns are non-raw string literals containing \w and
# \.; raw strings (r"...") would be the safer spelling.
114 def load_properties(f, interestingprops):
117 re1 = re.compile("^([0-9A-F]+) +; (\w+)")
118 re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
120 for line in fileinput.input(f):
# Properties that were not requested are filtered out by this test.
137 if prop not in interestingprops:
141 if prop not in props:
143 props[prop].append((d_lo, d_hi))
# Three ways of rendering code point `c` as a Rust character literal:
#   '\xNN'        two hex digits
#   '\uNNNN'      four hex digits
#   '\UNNNNNNNN'  eight hex digits
# Each form is zero-padded to its width by the %N.Nx format.
148 return "'\\x%2.2x'" % c
150 return "'\\u%4.4x'" % c
151 return "'\\U%8.8x'" % c
# Emit to `f` the Rust helper
#   fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool
# Its comparator treats each table entry as an inclusive (lo, hi) range:
# Equal when lo <= c <= hi, Less when the range lies entirely below c.
161 def emit_bsearch_range_table(f):
163 fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
164 use cmp::{Equal, Less, Greater};
165 use slice::ImmutableVector;
167 r.bsearch(|&(lo,hi)| {
168 if lo <= c && c <= hi { Equal }
169 else if hi < c { Less }
# Write the Rust module `pub mod <mod>` to `f`. For a category `cat` of `tbl`
# this writes:
#   - a static `<cat>_table` holding the (lo, hi) pairs of tbl[cat] as chars
#   - a public predicate `fn <cat>(c: char) -> bool` that calls
#     super::bsearch_range_table on that table
# `cat` is tested against a fixed list of category/property names, presumably
# so that everything not in the list is skipped -- TODO confirm.
175 def emit_property_module(f, mod, tbl):
176 f.write("pub mod %s {\n" % mod)
181 if cat not in ["Nd", "Nl", "No", "Cc",
182 "XID_Start", "XID_Continue", "Alphabetic",
183 "Lowercase", "Uppercase", "White_Space"]:
185 f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
187 for pair in tbl[cat]:
188 f.write(ch_prefix(ix))
189 f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
193 f.write(" pub fn %s(c: char) -> bool {\n" % cat)
194 f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
# Write the Rust module `pub mod conversions` to `f`. It contains:
#   - to_lower(c): looks `c` up in LuLl_table
#   - to_upper(c): looks `c` up in LlLu_table
#     (on a hit both return the second element, val1(), of the found pair)
#   - bsearch_case_table: search keyed on the first element of each pair,
#     yielding the index as Option<uint>
#   - the two tables, written by emit_caseconversion_table:
#     "LuLl" from `upperlower` and "LlLu" from `lowerupper`
199 def emit_conversions_module(f, lowerupper, upperlower):
200 f.write("pub mod conversions {")
202 use cmp::{Equal, Less, Greater};
203 use slice::ImmutableVector;
205 use option::{Option, Some, None};
207 pub fn to_lower(c: char) -> char {
208 match bsearch_case_table(c, LuLl_table) {
210 Some(index) => LuLl_table[index].val1()
214 pub fn to_upper(c: char) -> char {
215 match bsearch_case_table(c, LlLu_table) {
217 Some(index) => LlLu_table[index].val1()
221 fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<uint> {
222 table.bsearch(|&(key, _)| {
223 if c == key { Equal }
224 else if key < c { Less }
230 emit_caseconversion_table(f, "LuLl", upperlower)
231 emit_caseconversion_table(f, "LlLu", lowerupper)
# Write a Rust static `<name>_table` of (char, char) pairs to `f`, one pair
# per (key, value) item of `table`, in ascending key order. The search in
# the generated bsearch_case_table compares against the keys, so the order
# matters.
# NOTE(review): table.iteritems() exists only on Python 2 dicts; Python 3
# would need table.items().
234 def emit_caseconversion_table(f, name, table):
235 f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
236 sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
238 for key, value in sorted_table:
239 f.write(ch_prefix(ix))
240 f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
# Write the comma-separated string `content` to `f`, wrapped into lines.
# `content` is split on "," and the test len(line) + len(chunk) < 98 governs
# line breaking. A flushed line is written with a trailing ",\n" and the next
# line starts with `indent` spaces followed by the current chunk.
244 def format_table_content(f, content, indent):
247 for chunk in content.split(","):
248 if len(line) + len(chunk) < 98:
255 f.write(line + ",\n")
256 line = " "*indent + chunk
# Write the Rust module `pub mod normalization` (decomposition support) to
# `f`. The generated module contains:
#   - bsearch_table: search over (char, &'static [char]) entries keyed on the
#     char, returning Option<&'static [char]>
#   - canonical_table / compatibility_table, built from `canon` / `compat`
#   - decompose_canonical / decompose_compatible, which call d() with
#     k = false / k = true
#   - d(): a char <= '\x7f' is passed to the callback unchanged; chars in
#     [S_BASE, S_BASE + S_COUNT) go to decompose_hangul; otherwise the
#     canonical table is consulted and, only when k is true, the
#     compatibility table as well
#   - decompose_hangul and the S/L/V/T base and count constants it uses
# NOTE(review): several f.write(...) lines end with a stray ';' -- harmless
# in Python, but unidiomatic.
259 def emit_core_norm_module(f, canon, compat):
260 canon_keys = canon.keys()
263 compat_keys = compat.keys()
265 f.write("pub mod normalization {\n");
266 f.write(" use option::Option;\n");
267 f.write(" use option::{Some, None};\n");
268 f.write(" use slice::ImmutableVector;\n");
270 fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
271 use cmp::{Equal, Less, Greater};
272 match r.bsearch(|&(val, _)| {
273 if c == val { Equal }
274 else if val < c { Less }
278 let (_, result) = r[idx];
286 f.write(" // Canonical decompositions\n")
287 f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
# Build one string of entries, each beginning "(<key>,&[" followed by the
# escaped code points of its decomposition, then let format_table_content
# wrap it with an indent of 8.
290 for char in canon_keys:
294 data += "(%s,&[" % escape_char(char)
296 for d in canon[char]:
300 data += escape_char(d)
302 format_table_content(f, data, 8)
305 f.write(" // Compatibility decompositions\n")
306 f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
# Same serialisation as above, this time for the compatibility mappings.
309 for char in compat_keys:
313 data += "(%s,&[" % escape_char(char)
315 for d in compat[char]:
319 data += escape_char(d)
321 format_table_content(f, data, 8)
325 pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
327 pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
329 fn d(c: char, i: |char|, k: bool) {
332 // 7-bit ASCII never decomposes
333 if c <= '\\x7f' { i(c); return; }
335 // Perform decomposition for Hangul
336 if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
337 decompose_hangul(c, i);
341 // First check the canonical decompositions
342 match bsearch_table(c, canonical_table) {
344 for x in canon.iter() {
352 // Bottom out if we're not doing compat.
353 if !k { i(c); return; }
355 // Then check the compatibility decompositions
356 match bsearch_table(c, compatibility_table) {
358 for x in compat.iter() {
366 // Finally bottom out.
370 // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
371 static S_BASE: u32 = 0xAC00;
372 static L_BASE: u32 = 0x1100;
373 static V_BASE: u32 = 0x1161;
374 static T_BASE: u32 = 0x11A7;
375 static L_COUNT: u32 = 19;
376 static V_COUNT: u32 = 21;
377 static T_COUNT: u32 = 28;
378 static N_COUNT: u32 = (V_COUNT * T_COUNT);
379 static S_COUNT: u32 = (L_COUNT * N_COUNT);
381 // Decompose a precomposed Hangul syllable
382 fn decompose_hangul(s: char, f: |char|) {
385 let si = s as u32 - S_BASE;
387 let li = si / N_COUNT;
389 f(transmute(L_BASE + li));
391 let vi = (si % N_COUNT) / T_COUNT;
392 f(transmute(V_BASE + vi));
394 let ti = si % T_COUNT;
396 f(transmute(T_BASE + ti));
# Write a Rust module that is also named `normalization` (like the one from
# emit_core_norm_module) to `f`. It contains:
#   - bsearch_range_value_table: search over inclusive (lo, hi, value) ranges,
#     returning the u8 value
#   - the static combining_class_table of (char, char, u8) entries, written
#     from (lo, hi, class) triples (presumably the items of `combine` --
#     TODO confirm)
#   - pub fn canonical_combining_class(c: char) -> u8, which looks `c` up in
#     combining_class_table
404 def emit_std_norm_module(f, combine):
405 f.write("pub mod normalization {\n");
406 f.write(" use option::{Some, None};\n");
407 f.write(" use slice::ImmutableVector;\n");
410 fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
411 use cmp::{Equal, Less, Greater};
412 match r.bsearch(|&(lo, hi, _)| {
413 if lo <= c && c <= hi { Equal }
414 else if hi < c { Less }
418 let (_, _, result) = r[idx];
426 f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
429 f.write(ch_prefix(ix))
430 f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
434 f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
435 + " bsearch_range_value_table(c, combining_class_table)\n"
# Header text for generated Rust source (presumably written at the top of
# each output file -- TODO confirm). It holds the license notice, a
# "generated by src/etc/unicode.py, do not edit directly" note, and an inner
# #![allow(...)] attribute for the missing_doc and non_uppercase_statics lints.
440 preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
441 // file at the top-level directory of this distribution and at
442 // http://rust-lang.org/COPYRIGHT.
444 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
445 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
446 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
447 // option. This file may not be copied, modified, or distributed
448 // except according to those terms.
450 // NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
452 #![allow(missing_doc, non_uppercase_statics)]
# Module level: parse "UnicodeData.txt" (a relative path) once. The resulting
# globals are read by gen_core_unicode and gen_std_unicode.
456 (canon_decomp, compat_decomp, gencats,
457 combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
# Generate "core_unicode.rs". The file is opened for writing (a pre-existing
# file is checked for first; presumably it is removed -- TODO confirm) and the
# following are emitted in order:
#   1. the bsearch_range_table helper
#   2. module general_category, from the global `gencats`
#   3. module normalization, from `canon_decomp` / `compat_decomp`
#   4. module derived_property, from DerivedCoreProperties.txt (XID_Start,
#      XID_Continue, Alphabetic, Lowercase, Uppercase)
#   5. module property, from PropList.txt (White_Space)
#   6. module conversions, from `lowerupper` / `upperlower`
459 def gen_core_unicode():
460 r = "core_unicode.rs"
461 if os.path.exists(r):
463 with open(r, "w") as rf:
467 emit_bsearch_range_table(rf);
468 emit_property_module(rf, "general_category", gencats)
470 emit_core_norm_module(rf, canon_decomp, compat_decomp)
472 derived = load_properties("DerivedCoreProperties.txt",
473 ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
475 emit_property_module(rf, "derived_property", derived)
477 props = load_properties("PropList.txt", ["White_Space"])
478 emit_property_module(rf, "property", props)
479 emit_conversions_module(rf, lowerupper, upperlower)
# Generate the std-side output file named by `r`: open it for writing (after
# checking whether it already exists) and emit the normalization module built
# from the global `combines` via emit_std_norm_module.
481 def gen_std_unicode():
483 if os.path.exists(r):
485 with open(r, "w") as rf:
488 emit_std_norm_module(rf, combines)