4 Regenerate Unicode tables (tables.rs).
7 # This script uses the Unicode tables as defined
8 # in the UnicodeFiles class.
10 # Since this should not require frequent updates, we just store this
11 # out-of-line and check the tables.rs file into git.
13 # Note that the "curl" program is required for operation.
14 # This script is compatible with Python 2.7 and 3.x.
25 from collections import defaultdict, namedtuple
29 from itertools import zip_longest
30 from io import StringIO
32 # Python 2 compatibility
33 zip_longest = itertools.izip_longest
34 from StringIO import StringIO
37 # Completely optional type hinting
38 # (Python 2 compatible using comments,
39 # see: https://mypy.readthedocs.io/en/latest/python2.html)
40 # This is very helpful in typing-aware IDE like PyCharm.
41 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple
# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
    """Names of the Unicode data files consumed by this script."""
    # ReadMe does not contain any Unicode data, we
    # only use it to extract versions.
    README = "ReadMe.txt"

    DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
    DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
    PROPS = "PropList.txt"
    SCRIPTS = "Scripts.txt"
    SPECIAL_CASING = "SpecialCasing.txt"
    UNICODE_DATA = "UnicodeData.txt"
# The order doesn't really matter (Python < 3.6 won't preserve it),
# we only want to aggregate all the file names.
# (Dunder/private names are filtered out, so class machinery such as
# `__doc__` is not counted.)
ALL_UNICODE_FILES = tuple(
    value for name, value in UnicodeFiles.__dict__.items()
    if not name.startswith("_")
)

assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
# The directory this file is located in.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

# Where to download the Unicode data. The downloaded files
# will be placed in sub-directories named after Unicode version.
FETCH_DIR = os.path.join(THIS_DIR, "downloaded")

# URL templates; the `{filename}` (and `{version}`) placeholders are
# filled in by `get_fetch_url` inside `fetch_files`.
FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}"
FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}"
# NOTE(review): the lines below are the body of the `PREAMBLE` template
# string — its opening `PREAMBLE = """` assignment line was elided from this
# excerpt. The `//`-style lines are NOT Python comments; they are Rust
# source emitted verbatim at the top of the generated tables.rs (the doubled
# braces `{{ }}` survive `.format()` as literal braces). Do not insert
# anything between the lines below, or the emitted Rust preamble changes.
80 // NOTE: The following code was generated by "./unicode.py", do not edit directly
82 #![allow(missing_docs, non_upper_case_globals, non_snake_case, clippy::unreadable_literal)]
84 use crate::unicode::bool_trie::{{BoolTrie, SmallBoolTrie}};
85 use crate::unicode::version::UnicodeVersion;
86 """.format(year=datetime.datetime.now().year)
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
# Each specific General_Category maps to the derived/grouped categories
# it also belongs to (e.g. "Lu" is also "LC" and "L").
EXPANDED_CATEGORIES = {
    "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"],
    "Lm": ["L"], "Lo": ["L"],
    "Mn": ["M"], "Mc": ["M"], "Me": ["M"],
    "Nd": ["N"], "Nl": ["N"], "No": ["N"],
    "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"],
    "Pi": ["P"], "Pf": ["P"], "Po": ["P"],
    "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"],
    "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"],
    "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
}
# Surrogate codepoints occupy this inclusive range; a Rust `char` can
# never hold one of these values, so they are skipped everywhere below.
SURROGATE_CODEPOINTS_RANGE = (0xD800, 0xDFFF)
# Aggregated result of parsing UnicodeData.txt (built by
# `load_unicode_data`, enriched in-place by `load_special_casing`).
UnicodeData = namedtuple(
    "UnicodeData", (
        # Conversions: map of codepoint -> (cp, cp, cp) triples
        "to_upper", "to_lower", "to_title",

        # Decompositions: canonical decompositions, compatibility decomp
        "canon_decomp", "compat_decomp",

        # Grouped: general categories and combining characters
        "general_categories", "combines",
    )
)
# A parsed Unicode version: numeric components plus the original
# dotted string (e.g. "12.1.0").
UnicodeVersion = namedtuple(
    "UnicodeVersion", ("major", "minor", "micro", "as_str")
)
def fetch_files(version=None):
    # type: (str) -> UnicodeVersion
    """
    Fetch all the Unicode files from unicode.org.

    This will use cached files (stored in `FETCH_DIR`) if they exist,
    creating them if they don't. In any case, the Unicode version
    that the files correspond to is returned.

    Requires the external "curl" program.

    :param version: The desired Unicode version, as string.
        (If None, defaults to latest final release available,
        querying the unicode.org service).
    """
    have_version = check_stored_version(version)
    if have_version:
        # Everything we need is already cached locally.
        return have_version

    if version:
        # Check if the desired version exists on the server.
        get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
    else:
        # Extract the latest version.
        get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)

    readme_url = get_fetch_url(UnicodeFiles.README)

    print("Fetching: {}".format(readme_url))
    readme_content = subprocess.check_output(("curl", readme_url))

    unicode_version = parse_readme_unicode_version(
        readme_content.decode("utf8")
    )

    download_dir = get_unicode_dir(unicode_version)
    if not os.path.exists(download_dir):
        # For 2.7 compat, we don't use `exist_ok=True`.
        os.makedirs(download_dir)

    for filename in ALL_UNICODE_FILES:
        file_path = get_unicode_file_path(unicode_version, filename)

        if os.path.exists(file_path):
            # Assume file on the server didn't change if it's been saved before.
            continue

        if filename == UnicodeFiles.README:
            # The ReadMe was already downloaded above; just store it.
            with open(file_path, "wb") as fd:
                fd.write(readme_content)
        else:
            url = get_fetch_url(filename)
            print("Fetching: {}".format(url))
            subprocess.check_call(("curl", "-o", file_path, url))

    return unicode_version
def check_stored_version(version):
    # type: (Optional[str]) -> Optional[UnicodeVersion]
    """
    Given desired Unicode version, return the version
    if stored files are all present, and `None` otherwise.
    """
    if not version:
        # If no desired version specified, we should check what's the latest
        # version, skipping stored version checks.
        return None

    fetch_dir = os.path.join(FETCH_DIR, version)

    for filename in ALL_UNICODE_FILES:
        file_path = os.path.join(fetch_dir, filename)

        if not os.path.exists(file_path):
            # At least one file is missing — the cache is incomplete.
            return None

    # All files are cached; read the effective version out of the ReadMe.
    with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
        return parse_readme_unicode_version(fd.read())
def parse_readme_unicode_version(readme_content):
    # type: (str) -> UnicodeVersion
    """
    Parse the Unicode version contained in their `ReadMe.txt` file.

    Raises `AttributeError` if the version pattern cannot be found
    (`re.search` returns None in that case).
    """
    # "Raw string" is necessary for \d not being treated as escape char
    # (for the sake of compat with future Python versions).
    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
    groups = re.search(pattern, readme_content).groups()

    return UnicodeVersion(*map(int, groups), as_str=".".join(groups))
def get_unicode_dir(unicode_version):
    # type: (UnicodeVersion) -> str
    """
    Indicate in which parent dir the Unicode data files should be stored.

    This returns a full, absolute path.
    """
    return os.path.join(FETCH_DIR, unicode_version.as_str)
def get_unicode_file_path(unicode_version, filename):
    # type: (UnicodeVersion, str) -> str
    """
    Indicate where the Unicode data file should be stored.
    """
    return os.path.join(get_unicode_dir(unicode_version), filename)
def is_surrogate(n):
    # type: (int) -> bool
    """
    Tell if given codepoint is a surrogate (not a valid Rust character).
    """
    return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1]
# ---------------------------------------------------------------------------
# load_unicode_data: parse UnicodeData.txt into a `UnicodeData` namedtuple.
# NOTE(review): this excerpt carries fused line-number prefixes and several
# original lines are elided (blank lines, `continue` statements, the
# `all_codepoints`/`range_start` initialisation and range-closing logic,
# guards around decomposition and combining-class handling, and the
# `return UnicodeData(` opener) — the control flow below is incomplete
# as shown; do not run it verbatim.
# ---------------------------------------------------------------------------
243 def load_unicode_data(file_path):
244 # type: (str) -> UnicodeData
246 Load main Unicode data.
# Case-conversion maps: codepoint -> 3-codepoint tuple (zero-padded).
249 to_lower = {} # type: Dict[int, Tuple[int, int, int]]
250 to_upper = {} # type: Dict[int, Tuple[int, int, int]]
251 to_title = {} # type: Dict[int, Tuple[int, int, int]]
# Decomposition maps: codepoint -> list of decomposed codepoints.
254 compat_decomp = {} # type: Dict[int, List[int]]
255 canon_decomp = {} # type: Dict[int, List[int]]
257 # Combining characters
258 # FIXME: combines are not used
259 combines = defaultdict(set) # type: Dict[str, Set[int]]
# General_Category name -> set of codepoints in that category.
262 general_categories = defaultdict(set) # type: Dict[str, Set[int]]
263 category_assigned_codepoints = set() # type: Set[int]
# Each UnicodeData.txt line is 15 semicolon-separated fields
# (see UAX #44); surrogates are skipped since they are not Rust chars.
269 for line in fileinput.input(file_path):
270 data = line.split(";")
273 codepoint = int(data[0], 16)
274 if is_surrogate(codepoint):
# Ranges are encoded as "<Name, First>" / "<Name, Last>" line pairs;
# expand the pending range to every codepoint it covers.
277 for i in range(range_start, codepoint):
278 all_codepoints[i] = data
280 if data[1].endswith(", First>"):
281 range_start = codepoint
283 all_codepoints[codepoint] = data
# Second pass: unpack each record's 15 fields and populate the maps.
285 for code, data in all_codepoints.items():
286 (code_org, name, gencat, combine, bidi,
287 decomp, deci, digit, num, mirror,
288 old, iso, upcase, lowcase, titlecase) = data
290 # Generate char to char direct common and simple conversions:
292 # Uppercase to lowercase
293 if lowcase != "" and code_org != lowcase:
294 to_lower[code] = (int(lowcase, 16), 0, 0)
296 # Lowercase to uppercase
297 if upcase != "" and code_org != upcase:
298 to_upper[code] = (int(upcase, 16), 0, 0)
# Titlecase mapping, when distinct from the codepoint itself.
301 if titlecase.strip() != "" and code_org != titlecase:
302 to_title[code] = (int(titlecase, 16), 0, 0)
304 # Store decomposition, if given
# Compatibility decompositions are tagged "<tag> ..."; the tag is
# dropped and only the codepoints are kept.
306 decompositions = decomp.split()[1:]
307 decomp_code_points = [int(i, 16) for i in decompositions]
309 if decomp.startswith("<"):
310 # Compatibility decomposition
311 compat_decomp[code] = decomp_code_points
313 # Canonical decomposition
314 canon_decomp[code] = decomp_code_points
316 # Place letter in categories as appropriate.
317 for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
318 general_categories[cat].add(code)
319 category_assigned_codepoints.add(code)
321 # Record combining class, if any.
323 combines[combine].add(code)
325 # Generate Not_Assigned from Assigned.
326 general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
328 # Other contains Not_Assigned
329 general_categories["C"].update(general_categories["Cn"])
331 grouped_categories = group_categories(general_categories)
333 # FIXME: combines are not used
# Keyword construction (the `return UnicodeData(` opener is elided here).
335 to_lower=to_lower, to_upper=to_upper, to_title=to_title,
336 compat_decomp=compat_decomp, canon_decomp=canon_decomp,
337 general_categories=grouped_categories, combines=combines,
# ---------------------------------------------------------------------------
# load_special_casing: parse SpecialCasing.txt and merge its unconditional
# multi-character mappings into the to_lower/to_upper/to_title maps of the
# given UnicodeData (mutated in place; returns None).
# NOTE(review): the original `if len(data) == 5:` / `elif len(data) == 6:` /
# `else: continue` dispatch lines and the binding of `key` (the parsed
# source codepoint used at the bottom) are elided from this excerpt.
# ---------------------------------------------------------------------------
341 def load_special_casing(file_path, unicode_data):
342 # type: (str, UnicodeData) -> None
344 Load special casing data and enrich given Unicode data.
# Strip trailing comments, then split the semicolon-separated fields;
# 5 fields means unconditional, 6 means a condition column is present.
346 for line in fileinput.input(file_path):
347 data = line.split("#")[0].split(";")
349 code, lower, title, upper, _comment = data
351 code, lower, title, upper, condition, _comment = data
352 if condition.strip(): # Only keep unconditional mappings
357 lower = lower.strip()
358 title = title.strip()
359 upper = upper.strip()
# For each of the three conversion maps, store the mapping as exactly
# three codepoints, right-padding with 0 when fewer were given.
361 for (map_, values) in ((unicode_data.to_lower, lower),
362 (unicode_data.to_upper, upper),
363 (unicode_data.to_title, title)):
365 split = values.split()
367 codepoints = list(itertools.chain(
368 (int(i, 16) for i in split),
369 (0 for _ in range(len(split), 3))
372 assert len(codepoints) == 3
373 map_[key] = codepoints
def group_categories(mapping):
    # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]]
    """
    Group codepoints mapped in "categories".

    Each category's codepoint set is converted into a sorted list of
    inclusive (start, end) ranges via `group_codepoints`.
    """
    return {category: group_codepoints(codepoints)
            for category, codepoints in mapping.items()}
def group_codepoints(codepoints):
    # type: (Iterable[int]) -> List[Tuple[int, int]]
    """
    Group integral values into continuous, disjoint value ranges.

    Performs value deduplication.

    :return: sorted list of pairs denoting start and end of codepoint
        group values, both ends inclusive.

    >>> group_codepoints([1, 2, 10, 11, 12, 3, 4])
    [(1, 4), (10, 12)]
    >>> group_codepoints([1])
    [(1, 1)]
    >>> group_codepoints([1, 5, 6])
    [(1, 1), (5, 6)]
    >>> group_codepoints([])
    []
    """
    sorted_codes = sorted(set(codepoints))
    result = []  # type: List[Tuple[int, int]]

    if not sorted_codes:
        # Without this guard, `sorted_codes[0]` below would raise.
        return result

    next_codes = sorted_codes[1:]
    start_code = sorted_codes[0]

    # Walk the sorted values alongside their successors; a gap (or the end
    # of input, where the successor is None) closes the current range.
    for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None):
        if next_code is None or next_code - code != 1:
            result.append((start_code, code))
            start_code = next_code

    return result
def ungroup_codepoints(codepoint_pairs):
    # type: (Iterable[Tuple[int, int]]) -> List[int]
    """
    The inverse of group_codepoints -- produce a flat list of values
    from value range pairs.

    >>> ungroup_codepoints([(1, 4), (10, 12)])
    [1, 2, 3, 4, 10, 11, 12]
    >>> ungroup_codepoints([(1, 1), (5, 6)])
    [1, 5, 6]
    >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8]))
    [1, 2, 7, 8]
    >>> ungroup_codepoints([])
    []
    """
    # Each (lo, hi) pair is inclusive on both ends, hence `hi + 1`.
    return list(itertools.chain.from_iterable(
        range(lo, hi + 1) for lo, hi in codepoint_pairs
    ))
def get_unassigned_codepoints(assigned_codepoints):
    # type: (Set[int]) -> Set[int]
    """
    Given a set of "assigned" codepoints, return the set of all codepoints
    in 0..0x110000 that are neither assigned nor surrogates.
    """
    return {i for i in range(0, 0x110000)
            if i not in assigned_codepoints and not is_surrogate(i)}
# ---------------------------------------------------------------------------
# generate_table_lines: yield soft-wrapped, indented, comma-joined text
# lines from the given items (used when emitting Rust array literals).
# NOTE(review): most of the loop body is elided from this excerpt — only
# the wrap test and the line-reset statement survive; the accumulation,
# `yield`s and item iteration are missing as shown.
# ---------------------------------------------------------------------------
451 def generate_table_lines(items, indent, wrap=98):
452 # type: (Iterable[str], int, int) -> Iterator[str]
454 Given table items, generate wrapped lines of text with comma-separated items.
456 This is a generator function.
458 :param wrap: soft wrap limit (characters per line), integer.
# Keep appending to the current line while it stays under `wrap`...
463 if len(line) + len(item) < wrap:
# ...otherwise start a fresh indented line with the current item.
471 line = " " * indent + item
# ---------------------------------------------------------------------------
# load_properties: parse a property file (PropList.txt-style) into
# {property name: [(lo, hi), ...]} inclusive codepoint ranges, keeping only
# `interesting_props` (or everything, if that collection is falsy).
# NOTE(review): elided from this excerpt: the no-match guard/`continue`,
# the single-codepoint unpack branch (`d_lo, prop = groups; d_hi = d_lo`),
# the `else:` introducing the range unpack, and the final `return props`.
# ---------------------------------------------------------------------------
476 def load_properties(file_path, interesting_props):
477 # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]]
479 Load properties data and return in grouped form.
481 props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]]
482 # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
483 # (for the sake of compat with future Python versions).
484 # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# re1 matches "XXXX ; Prop" single codepoints, re2 matches
# "XXXX..YYYY ; Prop" codepoint ranges.
485 re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
486 re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
488 for line in fileinput.input(file_path):
489 match = re1.match(line) or re2.match(line)
491 groups = match.groups()
494 # `re1` matched (2 groups).
498 d_lo, d_hi, prop = groups
# Skip properties the caller did not ask for.
502 if interesting_props and prop not in interesting_props:
505 lo_value = int(d_lo, 16)
506 hi_value = int(d_hi, 16)
508 props[prop].append((lo_value, hi_value))
510 # Optimize if possible.
# Normalize each property's ranges: dedupe, sort and merge adjacency.
512 props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
def escape_char(c):
    # type: (int) -> str
    """
    Escape a codepoint for use as Rust char literal.

    Outputs are OK to use as Rust source code as char literals
    and they also include necessary quotes.

    NUL is special-cased to the short '\\0' form; every other codepoint
    uses the lowercase-hex '\\u{...}' form.
    """
    return r"'\u{%x}'" % c if c != 0 else r"'\0'"
def format_char_pair(pair):
    # type: (Tuple[int, int]) -> str
    """
    Format a pair of two Rust chars, e.g. "('\\u{61}','\\u{41}')".
    """
    return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
# ---------------------------------------------------------------------------
# generate_table: yield fragments of a formatted Rust `const` array.
# NOTE(review): the `def generate_table(` opener with the `name` parameter,
# the `pub_string` computation, the `data` accumulator initialisation and
# the per-line/closing `yield`s are elided from this excerpt.
# ---------------------------------------------------------------------------
543 items, # type: List[Tuple[int, int]]
544 decl_type="&[(char, char)]", # type: str
545 is_pub=True, # type: bool
546 format_item=format_char_pair, # type: Callable[[Tuple[int, int]], str]
548 # type: (...) -> Iterator[str]
550 Generate a nicely formatted Rust constant "table" array.
552 This generates actual Rust code.
# Emit the declaration header (rustfmt is told to skip the literal).
559 yield " #[rustfmt::skip]\n"
560 yield " %sconst %s: %s = &[\n" % (pub_string, name, decl_type)
# Format every item, then re-split on commas so the wrapper can re-wrap.
568 data.extend(format_item(item))
570 for table_line in generate_table_lines("".join(data).split(","), 8):
def compute_trie(raw_data, chunk_size):
    # type: (List[int], int) -> Tuple[List[int], List[int]]
    """
    Compute postfix-compressed trie.

    See: bool_trie.rs for more details.

    >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3)
    ([0, 0, 1], [1, 2, 3, 4, 5, 6])
    >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3)
    ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6])
    """
    root = []
    childmap = {}  # type: Dict[Tuple[int, ...], int]
    child_data = []

    assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized"

    for i in range(len(raw_data) // chunk_size):
        data = raw_data[i * chunk_size : (i + 1) * chunk_size]

        # Postfix compression of child nodes (data chunks)
        # (identical child nodes are shared).

        # Make a tuple out of the list so it's hashable.
        child = tuple(data)

        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)

        root.append(childmap[child])

    return root, child_data
# ---------------------------------------------------------------------------
# generate_bool_trie: yield fragments of Rust source declaring a BoolTrie
# constant covering the given inclusive codepoint ranges.
# NOTE(review): elided from this excerpt: the `chunk_size` definition, the
# `chunks` accumulation, the `pub_string` computation and several field
# `yield`s between the trie levels — the body is incomplete as shown.
# ---------------------------------------------------------------------------
611 def generate_bool_trie(name, codepoint_ranges, is_pub=False):
612 # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
614 Generate Rust code for BoolTrie struct.
616 This yields string fragments that should be joined to produce
# Mark every covered codepoint in a flat 0..0x110000 bitmap.
622 rawdata = [False] * 0x110000
623 for (lo, hi) in codepoint_ranges:
624 for cp in range(lo, hi + 1):
627 # Convert to bitmap chunks of `chunk_size` bits each.
629 for i in range(0x110000 // chunk_size):
631 for j in range(chunk_size):
632 if rawdata[i * chunk_size + j]:
641 yield " #[rustfmt::skip]\n"
642 yield " %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)
# First 0x800 codepoints are stored as a direct bitmap (r1).
644 data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size])
645 for fragment in generate_table_lines(data, 12):
649 # 0x800..0x10000 trie
650 (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
653 for fragment in generate_table_lines(data, 12):
658 data = ("0x%016x" % node for node in r3)
659 for fragment in generate_table_lines(data, 12):
663 # 0x10000..0x110000 trie
# Two-level compression for the supplementary planes.
664 (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size],
666 (r4, r5) = compute_trie(mid, 64)
670 for fragment in generate_table_lines(data, 12):
676 for fragment in generate_table_lines(data, 12):
681 data = ("0x%016x" % node for node in r6)
682 for fragment in generate_table_lines(data, 12):
# ---------------------------------------------------------------------------
# generate_small_bool_trie: yield fragments of Rust source declaring a
# SmallBoolTrie constant; used for sparse/low categories ("Cc",
# "White_Space" — see generate_property_module).
# NOTE(review): elided from this excerpt: the docstring delimiters, the
# `pub_string` computation, and the field-separator/closing `yield`s.
# ---------------------------------------------------------------------------
689 def generate_small_bool_trie(name, codepoint_ranges, is_pub=False):
690 # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
692 Generate Rust code for `SmallBoolTrie` struct.
# Size the bitmap from the highest covered codepoint (64 bits per chunk).
696 last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
697 n_chunks = last_chunk + 1
698 chunks = [0] * n_chunks
699 for (lo, hi) in codepoint_ranges:
700 for cp in range(lo, hi + 1):
701 assert cp // 64 < len(chunks)
702 chunks[cp // 64] |= 1 << (cp & 63)
709 yield " #[rustfmt::skip]\n"
710 yield (" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
711 % (pub_string, name))
# Single-level compression: r1 indexes into the deduplicated chunks r2.
713 (r1, r2) = compute_trie(chunks, 1)
716 data = (str(node) for node in r1)
717 for fragment in generate_table_lines(data, 12):
722 data = ("0x%016x" % node for node in r2)
723 for fragment in generate_table_lines(data, 12):
# ---------------------------------------------------------------------------
# generate_property_module: yield fragments of a Rust module containing one
# trie table plus one `pub fn <cat>(c: char) -> bool` lookup per category.
# NOTE(review): elided from this excerpt: the docstring delimiters, the
# `else:` before the generate_bool_trie call, and the trailing `yield`s
# that close the functions and the module.
# ---------------------------------------------------------------------------
730 def generate_property_module(mod, grouped_categories, category_subset):
731 # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str]
733 Generate Rust code for module defining properties.
736 yield "pub(crate) mod %s {" % mod
737 for cat in sorted(category_subset):
# "Cc" and "White_Space" are small/low-range enough for the small trie.
738 if cat in ("Cc", "White_Space"):
739 generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
741 generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
743 for fragment in generator:
747 yield " pub fn %s(c: char) -> bool {\n" % cat
748 yield " %s_table.lookup(c)\n" % cat
# ---------------------------------------------------------------------------
# generate_conversions_module: yield fragments of the Rust `conversions`
# module: to_lower/to_upper entry points, a binary-search helper, and the
# two generated case-mapping tables.
# NOTE(review): lines 762-777 below are the body of a Rust template string
# (its opening, presumably a `yield textwrap.dedent("""` line, is elided) —
# they are emitted Rust, not Python. Also elided: docstring delimiters,
# several closing parens/`yield`s, and the module-closing brace.
# ---------------------------------------------------------------------------
754 def generate_conversions_module(unicode_data):
755 # type: (UnicodeData) -> Iterator[str]
757 Generate Rust code for module defining conversions.
760 yield "pub(crate) mod conversions {"
762 pub fn to_lower(c: char) -> [char; 3] {
763 match bsearch_case_table(c, to_lowercase_table) {
764 None => [c, '\\0', '\\0'],
765 Some(index) => to_lowercase_table[index].1,
769 pub fn to_upper(c: char) -> [char; 3] {
770 match bsearch_case_table(c, to_uppercase_table) {
771 None => [c, '\\0', '\\0'],
772 Some(index) => to_uppercase_table[index].1,
776 fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
777 table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
# Back to Python: each table entry is (char, [char; 3]) — a source char
# and its (zero-padded) three-codepoint mapping.
780 decl_type = "&[(char, [char; 3])]"
781 format_conversion = lambda x: "({},[{},{},{}])".format(*(
782 escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2])
# Emit both case tables sorted by source codepoint for binary search.
785 for fragment in generate_table(
786 name="to_lowercase_table",
787 items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]),
790 format_item=format_conversion
794 for fragment in generate_table(
795 name="to_uppercase_table",
796 items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]),
799 format_item=format_conversion
def parse_args():
    # type: () -> argparse.Namespace
    """
    Parse command line arguments.

    Reads `sys.argv`; recognizes a single optional `-v`/`--version`
    argument (defaults to None, meaning "latest release").
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--version", default=None, type=str,
                        help="Unicode version to use (if not specified,"
                             " defaults to latest release).")

    return parser.parse_args()
# ---------------------------------------------------------------------------
# Body of the script's entry point (the `def main():` line, the argument
# parsing, the `buf = StringIO()` creation and the PREAMBLE write are
# elided from this excerpt): fetch data, build all tables in memory, then
# write tables.rs in one shot.
# NOTE(review): lines 835-839 are the body of a textwrap.dedent template
# string (Rust doc/attribute source), not Python — do not insert anything
# between them.
# ---------------------------------------------------------------------------
826 unicode_version = fetch_files(args.version)
827 print("Using Unicode version: {}".format(unicode_version.as_str))
829 # All the writing happens entirely in memory, we only write to file
830 # once we have generated the file content (it's not very large, <1 MB).
834 unicode_version_notice = textwrap.dedent("""
835 /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
836 /// `char` and `str` methods are based on.
837 #[unstable(feature = "unicode_version", issue = "49726")]
838 pub const UNICODE_VERSION: UnicodeVersion =
839 UnicodeVersion {{ major: {v.major}, minor: {v.minor}, micro: {v.micro}, _priv: () }};
840 """).format(v=unicode_version)
841 buf.write(unicode_version_notice)
# Shorthand for resolving a data file within the fetched version's dir.
843 get_path = lambda f: get_unicode_file_path(unicode_version, f)
845 unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
846 load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
# Derived properties and PropList properties needed by core.
848 want_derived = {"Alphabetic", "Lowercase", "Uppercase",
849 "Cased", "Case_Ignorable", "Grapheme_Extend"}
850 derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
852 props = load_properties(get_path(UnicodeFiles.PROPS),
853 {"White_Space", "Join_Control", "Noncharacter_Code_Point"})
# Emit one Rust module per property group, then the conversions module.
856 for (name, categories, category_subset) in (
857 ("general_category", unicode_data.general_categories, ["N", "Cc"]),
858 ("derived_property", derived, want_derived),
859 ("property", props, ["White_Space"])
861 for fragment in generate_property_module(name, categories, category_subset):
864 for fragment in generate_conversions_module(unicode_data):
867 tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
869 # Actually write out the file content.
870 # Will overwrite the file if it exists.
871 with open(tables_rs_path, "w") as fd:
872 fd.write(buf.getvalue())
874 print("Regenerated tables.rs.")
# NOTE(review): the guarded statement (presumably a `main()` call) was
# elided from this excerpt — restore it before running the script.
877 if __name__ == "__main__":