1 use std::collections::{BTreeMap, HashMap};
3 use ucd_parse::Codepoints;
9 use raw_emitter::{emit_codepoints, RawEmitter};
// Names of the properties that tables are generated for. `load_data` compares
// parsed property names (and general categories) against this list and keeps
// only the rows that match one of its entries.
11 static PROPERTIES: &[&str] = &[
// Code point ranges per property, stored as `(property name, ranges)` pairs.
// `load_data` fills this sorted by property name.
24 ranges: Vec<(&'static str, Vec<Range<u32>>)>,
// Uppercase mapping keyed by code point value. Each entry holds up to three
// mapped code point values; `load_data` and `to_mapping` store 0 in the unused
// trailing slots.
25 to_upper: BTreeMap<u32, (u32, u32, u32)>,
// Lowercase mapping, same layout as `to_upper`.
26 to_lower: BTreeMap<u32, (u32, u32, u32)>,
29 fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
34 for codepoint in codepoints {
35 if origin == codepoint.value() {
40 a = Some(codepoint.value());
41 } else if b.is_none() {
42 b = Some(codepoint.value());
43 } else if c.is_none() {
44 c = Some(codepoint.value());
46 panic!("more than 3 mapped codepoints")
50 Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
// Directory the Unicode data files are read from: it is handed to every
// `ucd_parse::parse` call and is where `version` looks for `ReadMe.txt`.
53 static UNICODE_DIRECTORY: &str = "unicode-downloads";
// Builds the `UnicodeData` used by the generator: the code point ranges of
// every property named in `PROPERTIES`, plus the lower/upper case mapping
// tables.
//
// Panics if any `ucd_parse::parse` call fails, since each result is unwrapped.
55 fn load_data() -> UnicodeData {
// presumably makes sure the data files exist under `UNICODE_DIRECTORY` —
// confirm in `unicode_download`
56 unicode_download::fetch_latest();
// Property name -> list of `Codepoints` entries, in the order they are read.
58 let mut properties = HashMap::new();
// Keep only the rows whose property name appears in `PROPERTIES`.
59 for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
60 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
61 properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
// Same filtering for the `ucd_parse::Property` rows.
64 for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
65 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
66 properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
// Case mapping tables, keyed by code point value.
70 let mut to_lower = BTreeMap::new();
71 let mut to_upper = BTreeMap::new();
72 for row in ucd_parse::UnicodeDataExpander::new(
73 ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
// The categories "Nd", "Nl" and "No" are special-cased; every other row is
// looked up under its own general category name.
// NOTE(review): presumably the three numeric categories are folded into one
// combined name — confirm.
75 let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
78 row.general_category.as_str()
// General categories are matched against `PROPERTIES` like property names.
80 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
83 .or_insert_with(Vec::new)
84 .push(Codepoints::Single(row.codepoint));
// Record simple one-to-one mappings, ignoring code points that map to
// themselves. The second and third slots are unused and set to 0.
87 if let Some(mapped) = row.simple_lowercase_mapping {
88 if mapped != row.codepoint {
89 to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
92 if let Some(mapped) = row.simple_uppercase_mapping {
93 if mapped != row.codepoint {
94 to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
// Special case mappings are inserted after the simple ones, so for a key that
// is already present `BTreeMap::insert` replaces the simple entry.
99 for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
100 if !row.conditions.is_empty() {
101 // Skip conditional case mappings
105 let key = row.codepoint.value();
// Only mappings for which `to_mapping` returns `Some` are stored.
106 if let Some(lower) = to_mapping(key, row.lowercase) {
107 to_lower.insert(key, lower);
109 if let Some(upper) = to_mapping(key, row.uppercase) {
110 to_upper.insert(key, upper);
// Turn every `Codepoints` entry into one-element `ch..ch + 1` ranges. A range
// is produced only where `scalar()` yields a value.
// NOTE(review): presumably this drops surrogate code points — confirm against
// `ucd_parse::Codepoint::scalar`.
114 let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
120 .flat_map(|codepoints| match codepoints {
121 Codepoints::Single(c) => c
123 .map(|ch| (ch as u32..ch as u32 + 1))
125 .collect::<Vec<_>>(),
126 Codepoints::Range(c) => c
128 .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
129 .collect::<Vec<_>>(),
131 .collect::<Vec<Range<u32>>>(),
// Coalesce the one-element ranges into contiguous ranges.
136 for ranges in properties.values_mut() {
137 merge_ranges(ranges);
// `HashMap` iteration order is unspecified, so sort by property name to give
// the result a stable order.
140 let mut properties = properties.into_iter().collect::<Vec<_>>();
141 properties.sort_by_key(|p| p.0);
142 UnicodeData { ranges: properties, to_lower, to_upper }
// Command line: the first argument is the path the generated tables are
// written to (required); the second, optional argument is a path for a
// generated test program.
//
// When the first argument is missing, a usage message is printed to stderr and
// the process exits with status 1.
146 let write_location = std::env::args().nth(1).unwrap_or_else(|| {
147 eprintln!("Must provide path to write unicode tables to");
149 "e.g. {} src/libcore/unicode/unicode_data.rs",
150 std::env::args().next().unwrap_or_default()
152 std::process::exit(1);
155 // Optional test path, which is a Rust source file testing that the unicode
156 // property lookups are correct.
157 let test_path = std::env::args().nth(2);
159 let unicode_data = load_data();
160 let ranges_by_property = &unicode_data.ranges;
// The test program refers to the table file through `write_location`.
162 if let Some(path) = test_path {
163 std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
// Emit one module per property, keeping a running total of the table sizes.
166 let mut total_bytes = 0;
167 let mut modules = Vec::new();
168 for (property, ranges) in ranges_by_property {
// Number of code points covered by the property: the sum of the range lengths.
169 let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
170 let mut emitter = RawEmitter::new();
171 emit_codepoints(&mut emitter, &ranges);
// The module is named after the lowercased property name.
173 modules.push((property.to_lowercase().to_string(), emitter.file));
175 "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})",
// `first()`/`last()` are unwrapped, so a property with no ranges panics here.
180 ranges.first().unwrap().start,
181 ranges.last().unwrap().end
183 total_bytes += emitter.bytes_used;
// Assemble the generated file: header line, range search helper, version
// constant, then one `pub mod` per entry in `modules`.
186 let mut table_file = String::new();
// NOTE(review): the emitted header starts with `///!`, which is an outer doc
// comment whose text begins with `!`; presumably `//!` was intended — confirm
// before changing, since it alters the generated file.
189 "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
192 // Include the range search function
193 table_file.push('\n');
// `include_str!` embeds the contents of range_search.rs at compile time.
194 table_file.push_str(include_str!("range_search.rs"));
195 table_file.push('\n');
197 table_file.push_str(&version());
199 table_file.push('\n');
// The case conversion tables go last, after all property modules.
201 modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
// Wrap each module's contents in `pub mod <name> { ... }` marked
// `#[rustfmt::skip]`. Non-blank lines get an indentation prefix; blank lines
// are emitted as bare newlines.
203 for (name, contents) in modules {
204 table_file.push_str("#[rustfmt::skip]\n");
205 table_file.push_str(&format!("pub mod {} {{\n", name));
206 for line in contents.lines() {
207 if !line.trim().is_empty() {
208 table_file.push_str(" ");
209 table_file.push_str(&line);
211 table_file.push('\n');
213 table_file.push_str("}\n\n");
// Trim trailing whitespace so the file ends with exactly one newline.
216 std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
218 println!("Total table sizes: {} bytes", total_bytes);
221 fn version() -> String {
222 let mut out = String::new();
223 out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
226 std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
229 let prefix = "for Version ";
230 let start = readme.find(prefix).unwrap() + prefix.len();
231 let end = readme.find(" of the Unicode Standard.").unwrap();
233 readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
234 let [major, minor, micro] = [version[0], version[1], version[2]];
236 out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
// Formats `values` as a wrapped, comma-separated list.
//
// Each value is rendered with its `Debug` representation followed by ", ".
// Pieces are appended to the current line while the line stays shorter than
// 98 bytes; otherwise the line is flushed to the output and a new one is
// started with that piece. Trailing whitespace is trimmed from every flushed
// line.
240 fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
241 let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
242 let mut out = String::new();
// The first line starts with a newline, so the list begins on a fresh line.
243 let mut line = format!("\n ");
244 for piece in pieces {
// `len()` counts bytes, not characters.
245 if line.len() + piece.len() < 98 {
246 line.push_str(&piece);
248 out.push_str(line.trim_end());
250 line = format!(" {}", piece);
// Flush whatever is left in the last line.
253 out.push_str(line.trim_end());
// Generates the source text of a Rust program that checks the generated
// `lookup` functions against `ranges`.
//
// `data_path` is written into a `#[path = "..."]` attribute on
// `mod unicode_data`, so the program loads the table file from that location.
// For every `(property, ranges)` pair the program's `main` prints a progress
// line and calls a function named after the lowercased property; that function
// contains the assertions produced by `generate_asserts`.
258 fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
259 let mut s = String::new();
260 s.push_str("#![allow(incomplete_features, unused)]\n");
261 s.push_str("#![feature(const_generics)]\n\n");
262 s.push_str(&format!("#[path = \"{}\"]\n", data_path));
263 s.push_str("mod unicode_data;\n\n");
265 s.push_str("\nfn main() {\n");
267 for (property, ranges) in ranges {
268 s.push_str(&format!(r#" println!("Testing {}");"#, property));
270 s.push_str(&format!(" {}();\n", property.to_lowercase()));
// Classify the values by whether any of the property's ranges contains them.
271 let mut is_true = Vec::new();
272 let mut is_false = Vec::new();
// NOTE(review): the range is half-open, so `char::MAX` itself is never
// classified — confirm whether that is intended.
273 for ch_num in 0..(std::char::MAX as u32) {
// Values that are not valid `char`s get special handling here.
274 if std::char::from_u32(ch_num).is_none() {
277 if ranges.iter().any(|r| r.contains(&ch_num)) {
278 is_true.push(ch_num);
280 is_false.push(ch_num);
// Emit the per-property function: positive assertions first, then negative.
284 s.push_str(&format!(" fn {}() {{\n", property.to_lowercase()));
285 generate_asserts(&mut s, property, &is_true, true);
286 generate_asserts(&mut s, property, &is_false, false);
287 s.push_str(" }\n\n");
// Appends assertion code for `points` to `s`.
//
// `points` is first grouped into ranges by `ranges_from_set`. A range holding a
// single value becomes one `assert!`; a longer range becomes a `for` loop over
// the range with an `assert!` in its body. When `truthy` is false the lookup is
// negated with `!`, i.e. the generated code asserts that the property does not
// hold. The lookup is addressed as `unicode_data::<lowercased property>::lookup`.
294 fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
295 for range in ranges_from_set(points) {
// Ranges are half-open, so `end == start + 1` means exactly one value.
296 if range.end == range.start + 1 {
298 " assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n",
299 if truthy { "" } else { "!" },
300 property.to_lowercase(),
// The character itself is interpolated into the assertion's message string.
302 std::char::from_u32(range.start).unwrap(),
// `{:?}` on a `Range<u32>` renders as `start..end`; the `u32` suffix that
// follows types the loop bounds in the generated code.
305 s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
307 " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
308 if truthy { "" } else { "!" },
309 property.to_lowercase(),
316 fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
317 let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
318 merge_ranges(&mut ranges);
/// Coalesces neighbouring ranges in place.
///
/// Two consecutive entries are joined whenever the first one's `end` equals
/// the second one's `start`, and chains of such entries collapse into a single
/// range. Entries are only compared with their immediate neighbour, so callers
/// are expected to pass the ranges in ascending order.
///
/// An empty vector is left untouched. A merged final element is not emitted a
/// second time.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
    for range in ranges.drain(..) {
        match merged.last_mut() {
            // Extends the previous range instead of starting a new one.
            Some(prev) if prev.end == range.start => prev.end = range.end,
            _ => merged.push(range),
        }
    }
    *ranges = merged;
}