src/tools/unicode-table-generator/src/main.rs

   1 //! This implements the core logic of the compression scheme used to compactly
   2 //! encode Unicode properties.
   3 //!
   4 //! We have two primary goals with the encoding: we want to be compact, because
   5 //! these tables often end up in ~every Rust program (especially the
   6 //! grapheme_extend table, used for str debugging), including those for embedded
   7 //! targets (where space is important). We also want to be relatively fast,
   8 //! though this is more of a nice to have rather than a key design constraint.
   9 //! It is expected that libraries/applications which are performance-sensitive
  10 //! to Unicode property lookups are extremely rare, and those that care may find
  11 //! the tradeoff of the raw bitsets worth it. For most applications, a
  12 //! relatively fast but much smaller (and as such less cache-impacting, etc.)
  13 //! data set is likely preferable.
  14 //!
  15 //! We have two separate encoding schemes: a skiplist-like approach, and a
  16 //! compressed bitset. The datasets we consider mostly use the skiplist (it's
  17 //! smaller) but the lowercase and uppercase sets are sufficiently sparse for
  18 //! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
  19 //! Since the bitset is also faster, this seems an obvious choice. (As a
  20 //! historical note, the bitset was also the prior implementation, so its
  21 //! relative complexity had already been paid).
  22 //!
  23 //! ## The bitset
  24 //!
  25 //! The primary idea is that we 'flatten' the Unicode ranges into an enormous
  26 //! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
  27 //! over 17 kilobytes of data per character set -- way too much for our
  28 //! purposes.
  29 //!
  30 //! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
  31 //! not skipping the small 'gap') is associated into words (u64) and
  32 //! deduplicated. On random data, this would be useless; on our data, this is
  33 //! incredibly beneficial -- our data sets have (far) less than 256 unique
  34 //! words.
  35 //!
  36 //! This gives us an array that maps `u8 -> word`; the current algorithm does
  37 //! not handle the case of more than 256 unique words, but we are relatively far
  38 //! from coming that close.
  39 //!
  40 //! With that scheme, we now have a single byte for every 64 codepoints.
  41 //!
  42 //! We further chunk these by some constant N (between 1 and 64 per group,
  43 //! dynamically chosen for smallest size), and again deduplicate and store in an
  44 //! array (u8 -> [u8; N]).
  45 //!
  46 //! The bytes of this array map into the words from the bitset above, but we
  47 //! apply another trick here: some of these words are similar enough that they
  48 //! can be represented by some function of another word. The particular
  49 //! functions chosen are rotation, inversion, and shifting (right).
  50 //!
  51 //! ## The skiplist
  52 //!
  53 //! The skip list arose out of the desire for an even smaller encoding than the
  54 //! bitset -- and was the answer to the question "what is the smallest
  55 //! representation we can imagine?". However, it is not necessarily the
  56 //! smallest, and if you have a better proposal, please do suggest it!
  57 //!
  58 //! This is a relatively straightforward encoding. First, we break up all the
  59 //! ranges in the input data into offsets from each other, essentially a gap
  60 //! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
  61 //! store those directly. We make use of the larger gaps (which are nicely
  62 //! interspersed already) throughout the dataset to index this data set.
  63 //!
  64 //! In particular, each run of small gaps (terminating in a large gap) is
  65 //! indexed in a separate dataset. That data set stores an index into the
  66 //! primary offset list and a prefix sum of that offset list. These are packed
  67 //! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
  68 //!
  69 //! Lookup proceeds via a binary search in the index and then a straightforward
  70 //! linear scan (adding up the offsets) until we reach the needle, and then the
  71 //! index of that offset is utilized as the answer to whether we're in the set
  72 //! or not.
  73
  74 use std::collections::{BTreeMap, HashMap};
  75 use std::ops::Range;
  76 use ucd_parse::Codepoints;
  77
  78 mod cascading_map;
  79 mod case_mapping;
  80 mod raw_emitter;
  81 mod skiplist;
  82 mod unicode_download;
  83
  84 use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
  85
  86 static PROPERTIES: &[&str] = &[
  87     "Alphabetic",
  88     "Lowercase",
  89     "Uppercase",
  90     "Cased",
  91     "Case_Ignorable",
  92     "Grapheme_Extend",
  93     "White_Space",
  94     "Cc",
  95     "N",
  96 ];
  97
  98 struct UnicodeData {
  99     ranges: Vec<(&'static str, Vec<Range<u32>>)>,
 100     to_upper: BTreeMap<u32, (u32, u32, u32)>,
 101     to_lower: BTreeMap<u32, (u32, u32, u32)>,
 102 }
 103
 104 fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
 105     let mut a = None;
 106     let mut b = None;
 107     let mut c = None;
 108
 109     for codepoint in codepoints {
 110         if origin == codepoint.value() {
 111             return None;
 112         }
 113
 114         if a.is_none() {
 115             a = Some(codepoint.value());
 116         } else if b.is_none() {
 117             b = Some(codepoint.value());
 118         } else if c.is_none() {
 119             c = Some(codepoint.value());
 120         } else {
 121             panic!("more than 3 mapped codepoints")
 122         }
 123     }
 124
 125     Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
 126 }
 127
 128 static UNICODE_DIRECTORY: &str = "unicode-downloads";
 129
 130 fn load_data() -> UnicodeData {
 131     unicode_download::fetch_latest();
 132
 133     let mut properties = HashMap::new();
 134     for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
 135         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
 136             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
 137         }
 138     }
 139     for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
 140         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
 141             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
 142         }
 143     }
 144
 145     let mut to_lower = BTreeMap::new();
 146     let mut to_upper = BTreeMap::new();
 147     for row in ucd_parse::UnicodeDataExpander::new(
 148         ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
 149     ) {
 150         let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
 151             "N"
 152         } else {
 153             row.general_category.as_str()
 154         };
 155         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
 156             properties
 157                 .entry(*name)
 158                 .or_insert_with(Vec::new)
 159                 .push(Codepoints::Single(row.codepoint));
 160         }
 161
 162         if let Some(mapped) = row.simple_lowercase_mapping {
 163             if mapped != row.codepoint {
 164                 to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
 165             }
 166         }
 167         if let Some(mapped) = row.simple_uppercase_mapping {
 168             if mapped != row.codepoint {
 169                 to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
 170             }
 171         }
 172     }
 173
 174     for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
 175         if !row.conditions.is_empty() {
 176             // Skip conditional case mappings
 177             continue;
 178         }
 179
 180         let key = row.codepoint.value();
 181         if let Some(lower) = to_mapping(key, row.lowercase) {
 182             to_lower.insert(key, lower);
 183         }
 184         if let Some(upper) = to_mapping(key, row.uppercase) {
 185             to_upper.insert(key, upper);
 186         }
 187     }
 188
 189     let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
 190         .into_iter()
 191         .map(|(k, v)| {
 192             (
 193                 k,
 194                 v.into_iter()
 195                     .flat_map(|codepoints| match codepoints {
 196                         Codepoints::Single(c) => c
 197                             .scalar()
 198                             .map(|ch| (ch as u32..ch as u32 + 1))
 199                             .into_iter()
 200                             .collect::<Vec<_>>(),
 201                         Codepoints::Range(c) => c
 202                             .into_iter()
 203                             .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
 204                             .collect::<Vec<_>>(),
 205                     })
 206                     .collect::<Vec<Range<u32>>>(),
 207             )
 208         })
 209         .collect();
 210
 211     for ranges in properties.values_mut() {
 212         merge_ranges(ranges);
 213     }
 214
 215     let mut properties = properties.into_iter().collect::<Vec<_>>();
 216     properties.sort_by_key(|p| p.0);
 217     UnicodeData { ranges: properties, to_lower, to_upper }
 218 }
 219
 220 fn main() {
 221     let write_location = std::env::args().nth(1).unwrap_or_else(|| {
 222         eprintln!("Must provide path to write unicode tables to");
 223         eprintln!(
 224             "e.g. {} library/core/unicode/unicode_data.rs",
 225             std::env::args().next().unwrap_or_default()
 226         );
 227         std::process::exit(1);
 228     });
 229
 230     // Optional test path, which is a Rust source file testing that the unicode
 231     // property lookups are correct.
 232     let test_path = std::env::args().nth(2);
 233
 234     let unicode_data = load_data();
 235     let ranges_by_property = &unicode_data.ranges;
 236
 237     if let Some(path) = test_path {
 238         std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
 239     }
 240
 241     let mut total_bytes = 0;
 242     let mut modules = Vec::new();
 243     for (property, ranges) in ranges_by_property {
 244         let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
 245
 246         let mut emitter = RawEmitter::new();
 247         if property == &"White_Space" {
 248             emit_whitespace(&mut emitter, &ranges);
 249         } else {
 250             emit_codepoints(&mut emitter, &ranges);
 251         }
 252
 253         modules.push((property.to_lowercase().to_string(), emitter.file));
 254         println!(
 255             "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
 256             property,
 257             emitter.bytes_used,
 258             datapoints,
 259             ranges.len(),
 260             ranges.first().unwrap().start,
 261             ranges.last().unwrap().end,
 262             emitter.desc,
 263         );
 264         total_bytes += emitter.bytes_used;
 265     }
 266
 267     let mut table_file = String::new();
 268
 269     table_file.push_str(
 270         "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
 271     );
 272
 273     // Include the range search function
 274     table_file.push('\n');
 275     table_file.push_str(include_str!("range_search.rs"));
 276     table_file.push('\n');
 277
 278     table_file.push_str(&version());
 279
 280     table_file.push('\n');
 281
 282     modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
 283
 284     for (name, contents) in modules {
 285         table_file.push_str("#[rustfmt::skip]\n");
 286         table_file.push_str(&format!("pub mod {name} {{\n"));
 287         for line in contents.lines() {
 288             if !line.trim().is_empty() {
 289                 table_file.push_str("    ");
 290                 table_file.push_str(&line);
 291             }
 292             table_file.push('\n');
 293         }
 294         table_file.push_str("}\n\n");
 295     }
 296
 297     std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
 298
 299     println!("Total table sizes: {total_bytes} bytes");
 300 }
 301
 302 fn version() -> String {
 303     let mut out = String::new();
 304     out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");
 305
 306     let readme =
 307         std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
 308             .unwrap();
 309
 310     let prefix = "for Version ";
 311     let start = readme.find(prefix).unwrap() + prefix.len();
 312     let end = readme.find(" of the Unicode Standard.").unwrap();
 313     let version =
 314         readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
 315     let [major, minor, micro] = [version[0], version[1], version[2]];
 316
 317     out.push_str(&format!("({major}, {minor}, {micro});\n"));
 318     out
 319 }
 320
 321 fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
 322     let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
 323     let mut out = String::new();
 324     let mut line = String::from("\n    ");
 325     for piece in pieces {
 326         if line.len() + piece.len() < 98 {
 327             line.push_str(&piece);
 328         } else {
 329             out.push_str(line.trim_end());
 330             out.push('\n');
 331             line = format!("    {piece}");
 332         }
 333     }
 334     out.push_str(line.trim_end());
 335     out.push('\n');
 336     out
 337 }
 338
 339 fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
 340     let mut s = String::new();
 341     s.push_str("#![allow(incomplete_features, unused)]\n");
 342     s.push_str("#![feature(const_generics)]\n\n");
 343     s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
 344     s.push_str(&format!("#[path = \"{data_path}\"]\n"));
 345     s.push_str("mod unicode_data;\n\n");
 346
 347     s.push_str("\nfn main() {\n");
 348
 349     for (property, ranges) in ranges {
 350         s.push_str(&format!(r#"    println!("Testing {}");"#, property));
 351         s.push('\n');
 352         s.push_str(&format!("    {}_true();\n", property.to_lowercase()));
 353         s.push_str(&format!("    {}_false();\n", property.to_lowercase()));
 354         let mut is_true = Vec::new();
 355         let mut is_false = Vec::new();
 356         for ch_num in 0..(std::char::MAX as u32) {
 357             if std::char::from_u32(ch_num).is_none() {
 358                 continue;
 359             }
 360             if ranges.iter().any(|r| r.contains(&ch_num)) {
 361                 is_true.push(ch_num);
 362             } else {
 363                 is_false.push(ch_num);
 364             }
 365         }
 366
 367         s.push_str(&format!("    fn {}_true() {{\n", property.to_lowercase()));
 368         generate_asserts(&mut s, property, &is_true, true);
 369         s.push_str("    }\n\n");
 370         s.push_str(&format!("    fn {}_false() {{\n", property.to_lowercase()));
 371         generate_asserts(&mut s, property, &is_false, false);
 372         s.push_str("    }\n\n");
 373     }
 374
 375     s.push_str("}");
 376     s
 377 }
 378
 379 fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
 380     for range in ranges_from_set(points) {
 381         if range.end == range.start + 1 {
 382             s.push_str(&format!(
 383                 "        assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
 384                 if truthy { "" } else { "!" },
 385                 property.to_lowercase(),
 386                 std::char::from_u32(range.start).unwrap(),
 387                 range.start,
 388             ));
 389         } else {
 390             s.push_str(&format!("        for chn in {:?}u32 {{\n", range));
 391             s.push_str(&format!(
 392                 "            assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
 393                 if truthy { "" } else { "!" },
 394                 property.to_lowercase(),
 395             ));
 396             s.push_str("        }\n");
 397         }
 398     }
 399 }
 400
 401 fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
 402     let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
 403     merge_ranges(&mut ranges);
 404     ranges
 405 }
 406
 407 fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
 408     loop {
 409         let mut new_ranges = Vec::new();
 410         let mut idx_iter = 0..(ranges.len() - 1);
 411         let mut should_insert_last = true;
 412         while let Some(idx) = idx_iter.next() {
 413             let cur = ranges[idx].clone();
 414             let next = ranges[idx + 1].clone();
 415             if cur.end == next.start {
 416                 if idx_iter.next().is_none() {
 417                     // We're merging the last element
 418                     should_insert_last = false;
 419                 }
 420                 new_ranges.push(cur.start..next.end);
 421             } else {
 422                 // We're *not* merging the last element
 423                 should_insert_last = true;
 424                 new_ranges.push(cur);
 425             }
 426         }
 427         if should_insert_last {
 428             new_ranges.push(ranges.last().unwrap().clone());
 429         }
 430         if new_ranges.len() == ranges.len() {
 431             *ranges = new_ranges;
 432             break;
 433         } else {
 434             *ranges = new_ranges;
 435         }
 436     }
 437
 438     let mut last_end = None;
 439     for range in ranges {
 440         if let Some(last) = last_end {
 441             assert!(range.start > last, "{:?}", range);
 442         }
 443         last_end = Some(range.end);
 444     }
 445 }