src/tools/unicode-table-generator/src/main.rs

   1 use std::collections::{BTreeMap, HashMap};
   2 use std::ops::Range;
   3 use ucd_parse::Codepoints;
   4
   5 mod case_mapping;
   6 mod raw_emitter;
   7 mod unicode_download;
   8
   9 use raw_emitter::{emit_codepoints, RawEmitter};
  10
/// Names of the Unicode properties for which lookup tables are generated.
///
/// `load_data` only keeps rows whose property name (or general category)
/// appears in this list; everything else in the data files is ignored.
static PROPERTIES: &[&str] = &[
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "Cased",
    "Case_Ignorable",
    "Grapheme_Extend",
    "White_Space",
    // Matched against the general category of `UnicodeData` rows.
    "Cc",
    // Synthetic name: `load_data` folds the "Nd", "Nl" and "No" general
    // categories into "N" before comparing against this list.
    "N",
];
  22
/// Everything `load_data` extracts from the Unicode data files.
struct UnicodeData {
    /// `(property name, code point ranges)` pairs, sorted by property name.
    /// Ranges are half-open (`start..end`) and have been passed through
    /// `merge_ranges`.
    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
    /// Uppercase mappings keyed by source code point. Each value holds up to
    /// three mapped code points; unused trailing slots are `0`.
    to_upper: BTreeMap<u32, (u32, u32, u32)>,
    /// Lowercase mappings keyed by source code point, in the same layout as
    /// `to_upper`.
    to_lower: BTreeMap<u32, (u32, u32, u32)>,
}
  28
  29 fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
  30     let mut a = None;
  31     let mut b = None;
  32     let mut c = None;
  33
  34     for codepoint in codepoints {
  35         if origin == codepoint.value() {
  36             return None;
  37         }
  38
  39         if a.is_none() {
  40             a = Some(codepoint.value());
  41         } else if b.is_none() {
  42             b = Some(codepoint.value());
  43         } else if c.is_none() {
  44             c = Some(codepoint.value());
  45         } else {
  46             panic!("more than 3 mapped codepoints")
  47         }
  48     }
  49
  50     Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
  51 }
  52
/// Directory the Unicode data files are read from: it is handed to
/// `ucd_parse::parse` in `load_data`, and `version` reads `ReadMe.txt` from it.
// NOTE(review): presumably also where `unicode_download::fetch_latest` stores
// its downloads — confirm against that module.
static UNICODE_DIRECTORY: &str = "unicode-downloads";
  54
  55 fn load_data() -> UnicodeData {
  56     unicode_download::fetch_latest();
  57
  58     let mut properties = HashMap::new();
  59     for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
  60         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
  61             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
  62         }
  63     }
  64     for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
  65         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
  66             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
  67         }
  68     }
  69
  70     let mut to_lower = BTreeMap::new();
  71     let mut to_upper = BTreeMap::new();
  72     for row in ucd_parse::UnicodeDataExpander::new(
  73         ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
  74     ) {
  75         let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
  76             "N"
  77         } else {
  78             row.general_category.as_str()
  79         };
  80         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
  81             properties
  82                 .entry(*name)
  83                 .or_insert_with(Vec::new)
  84                 .push(Codepoints::Single(row.codepoint));
  85         }
  86
  87         if let Some(mapped) = row.simple_lowercase_mapping {
  88             if mapped != row.codepoint {
  89                 to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
  90             }
  91         }
  92         if let Some(mapped) = row.simple_uppercase_mapping {
  93             if mapped != row.codepoint {
  94                 to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
  95             }
  96         }
  97     }
  98
  99     for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
 100         if !row.conditions.is_empty() {
 101             // Skip conditional case mappings
 102             continue;
 103         }
 104
 105         let key = row.codepoint.value();
 106         if let Some(lower) = to_mapping(key, row.lowercase) {
 107             to_lower.insert(key, lower);
 108         }
 109         if let Some(upper) = to_mapping(key, row.uppercase) {
 110             to_upper.insert(key, upper);
 111         }
 112     }
 113
 114     let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
 115         .into_iter()
 116         .map(|(k, v)| {
 117             (
 118                 k,
 119                 v.into_iter()
 120                     .flat_map(|codepoints| match codepoints {
 121                         Codepoints::Single(c) => c
 122                             .scalar()
 123                             .map(|ch| (ch as u32..ch as u32 + 1))
 124                             .into_iter()
 125                             .collect::<Vec<_>>(),
 126                         Codepoints::Range(c) => c
 127                             .into_iter()
 128                             .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
 129                             .collect::<Vec<_>>(),
 130                     })
 131                     .collect::<Vec<Range<u32>>>(),
 132             )
 133         })
 134         .collect();
 135
 136     for ranges in properties.values_mut() {
 137         merge_ranges(ranges);
 138     }
 139
 140     let mut properties = properties.into_iter().collect::<Vec<_>>();
 141     properties.sort_by_key(|p| p.0);
 142     UnicodeData { ranges: properties, to_lower, to_upper }
 143 }
 144
 145 fn main() {
 146     let write_location = std::env::args().nth(1).unwrap_or_else(|| {
 147         eprintln!("Must provide path to write unicode tables to");
 148         eprintln!(
 149             "e.g. {} src/libcore/unicode/unicode_data.rs",
 150             std::env::args().next().unwrap_or_default()
 151         );
 152         std::process::exit(1);
 153     });
 154
 155     // Optional test path, which is a Rust source file testing that the unicode
 156     // property lookups are correct.
 157     let test_path = std::env::args().nth(2);
 158
 159     let unicode_data = load_data();
 160     let ranges_by_property = &unicode_data.ranges;
 161
 162     if let Some(path) = test_path {
 163         std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
 164     }
 165
 166     let mut total_bytes = 0;
 167     let mut modules = Vec::new();
 168     for (property, ranges) in ranges_by_property {
 169         let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
 170         let mut emitter = RawEmitter::new();
 171         emit_codepoints(&mut emitter, &ranges);
 172
 173         modules.push((property.to_lowercase().to_string(), emitter.file));
 174         println!(
 175             "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})",
 176             property,
 177             emitter.bytes_used,
 178             datapoints,
 179             ranges.len(),
 180             ranges.first().unwrap().start,
 181             ranges.last().unwrap().end
 182         );
 183         total_bytes += emitter.bytes_used;
 184     }
 185
 186     let mut table_file = String::new();
 187
 188     table_file.push_str(
 189         "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
 190     );
 191
 192     // Include the range search function
 193     table_file.push('\n');
 194     table_file.push_str(include_str!("range_search.rs"));
 195     table_file.push('\n');
 196
 197     table_file.push_str(&version());
 198
 199     table_file.push('\n');
 200
 201     modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
 202
 203     for (name, contents) in modules {
 204         table_file.push_str("#[rustfmt::skip]\n");
 205         table_file.push_str(&format!("pub mod {} {{\n", name));
 206         for line in contents.lines() {
 207             if !line.trim().is_empty() {
 208                 table_file.push_str("    ");
 209                 table_file.push_str(&line);
 210             }
 211             table_file.push('\n');
 212         }
 213         table_file.push_str("}\n\n");
 214     }
 215
 216     std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
 217
 218     println!("Total table sizes: {} bytes", total_bytes);
 219 }
 220
 221 fn version() -> String {
 222     let mut out = String::new();
 223     out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
 224
 225     let readme =
 226         std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
 227             .unwrap();
 228
 229     let prefix = "for Version ";
 230     let start = readme.find(prefix).unwrap() + prefix.len();
 231     let end = readme.find(" of the Unicode Standard.").unwrap();
 232     let version =
 233         readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
 234     let [major, minor, micro] = [version[0], version[1], version[2]];
 235
 236     out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
 237     out
 238 }
 239
 240 fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
 241     let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
 242     let mut out = String::new();
 243     let mut line = format!("\n    ");
 244     for piece in pieces {
 245         if line.len() + piece.len() < 98 {
 246             line.push_str(&piece);
 247         } else {
 248             out.push_str(line.trim_end());
 249             out.push('\n');
 250             line = format!("    {}", piece);
 251         }
 252     }
 253     out.push_str(line.trim_end());
 254     out.push('\n');
 255     out
 256 }
 257
 258 fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
 259     let mut s = String::new();
 260     s.push_str("#![allow(incomplete_features, unused)]\n");
 261     s.push_str("#![feature(const_generics)]\n\n");
 262     s.push_str(&format!("#[path = \"{}\"]\n", data_path));
 263     s.push_str("mod unicode_data;\n\n");
 264
 265     s.push_str("\nfn main() {\n");
 266
 267     for (property, ranges) in ranges {
 268         s.push_str(&format!(r#"    println!("Testing {}");"#, property));
 269         s.push('\n');
 270         s.push_str(&format!("    {}();\n", property.to_lowercase()));
 271         let mut is_true = Vec::new();
 272         let mut is_false = Vec::new();
 273         for ch_num in 0..(std::char::MAX as u32) {
 274             if std::char::from_u32(ch_num).is_none() {
 275                 continue;
 276             }
 277             if ranges.iter().any(|r| r.contains(&ch_num)) {
 278                 is_true.push(ch_num);
 279             } else {
 280                 is_false.push(ch_num);
 281             }
 282         }
 283
 284         s.push_str(&format!("    fn {}() {{\n", property.to_lowercase()));
 285         generate_asserts(&mut s, property, &is_true, true);
 286         generate_asserts(&mut s, property, &is_false, false);
 287         s.push_str("    }\n\n");
 288     }
 289
 290     s.push_str("}");
 291     s
 292 }
 293
 294 fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
 295     for range in ranges_from_set(points) {
 296         if range.end == range.start + 1 {
 297             s.push_str(&format!(
 298                 "        assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n",
 299                 if truthy { "" } else { "!" },
 300                 property.to_lowercase(),
 301                 range.start,
 302                 std::char::from_u32(range.start).unwrap(),
 303         ));
 304         } else {
 305             s.push_str(&format!("        for chn in {:?}u32 {{\n", range));
 306             s.push_str(&format!(
 307                 "            assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
 308                 if truthy { "" } else { "!" },
 309                 property.to_lowercase(),
 310         ));
 311             s.push_str("        }\n");
 312         }
 313     }
 314 }
 315
 316 fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
 317     let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
 318     merge_ranges(&mut ranges);
 319     ranges
 320 }
 321
 322 fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
 323     loop {
 324         let mut new_ranges = Vec::new();
 325         let mut idx_iter = 0..(ranges.len() - 1);
 326         while let Some(idx) = idx_iter.next() {
 327             let cur = ranges[idx].clone();
 328             let next = ranges[idx + 1].clone();
 329             if cur.end == next.start {
 330                 let _ = idx_iter.next(); // skip next as we're merging it in
 331                 new_ranges.push(cur.start..next.end);
 332             } else {
 333                 new_ranges.push(cur);
 334             }
 335         }
 336         new_ranges.push(ranges.last().unwrap().clone());
 337         if new_ranges.len() == ranges.len() {
 338             *ranges = new_ranges;
 339             break;
 340         } else {
 341             *ranges = new_ranges;
 342         }
 343     }
 344 }