1 //! This implements the core logic of the compression scheme used to compactly
2 //! encode Unicode properties.
4 //! We have two primary goals with the encoding: we want to be compact, because
5 //! these tables often end up in ~every Rust program (especially the
6 //! grapheme_extend table, used for str debugging), including those for embedded
7 //! targets (where space is important). We also want to be relatively fast,
8 //! though this is more of a nice to have rather than a key design constraint.
9 //! It is expected that libraries/applications which are performance-sensitive
10 //! to Unicode property lookups are extremely rare, and those that care may find
11 //! the tradeoff of the raw bitsets worth it. For most applications, a
12 //! relatively fast but much smaller (and as such less cache-impacting, etc.)
13 //! data set is likely preferable.
15 //! We have two separate encoding schemes: a skiplist-like approach, and a
16 //! compressed bitset. The datasets we consider mostly use the skiplist (it's
17 //! smaller) but the lowercase and uppercase sets are sufficiently sparse for
18 //! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
19 //! Since the bitset is also faster, this seems an obvious choice. (As a
20 //! historical note, the bitset was also the prior implementation, so its
21 //! relative complexity had already been paid).
25 //! The primary idea is that we 'flatten' the Unicode ranges into an enormous
26 //! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
27 //! over 17 kilobytes of data per character set -- way too much for our
30 //! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
31 //! not skipping the small 'gap') is associated into words (u64) and
32 //! deduplicated. On random data, this would be useless; on our data, this is
33 //! incredibly beneficial -- our data sets have (far) less than 256 unique
36 //! This gives us an array that maps `u8 -> word`; the current algorithm does
37 //! not handle the case of more than 256 unique words, but we are relatively far
38 //! from coming that close.
40 //! With that scheme, we now have a single byte for every 64 codepoints.
42 //! We further chunk these by some constant N (between 1 and 64 per group,
43 //! dynamically chosen for smallest size), and again deduplicate and store in an
44 //! array (u8 -> [u8; N]).
46 //! The bytes of this array map into the words from the bitset above, but we
47 //! apply another trick here: some of these words are similar enough that they
48 //! can be represented by some function of another word. The particular
49 //! functions chosen are rotation, inversion, and shifting (right).
53 //! The skip list arose out of the desire for an even smaller encoding than the
54 //! bitset -- and was the answer to the question "what is the smallest
55 //! representation we can imagine?". However, it is not necessarily the
56 //! smallest, and if you have a better proposal, please do suggest it!
58 //! This is a relatively straightforward encoding. First, we break up all the
59 //! ranges in the input data into offsets from each other, essentially a gap
60 //! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
61 //! store those directly. We make use of the larger gaps (which are nicely
62 //! interspersed already) throughout the dataset to index this data set.
64 //! In particular, each run of small gaps (terminating in a large gap) is
65 //! indexed in a separate dataset. That data set stores an index into the
66 //! primary offset list and a prefix sum of that offset list. These are packed
67 //! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
69 //! Lookup proceeds via a binary search in the index and then a straightforward
70 //! linear scan (adding up the offsets) until we reach the needle, and then the
71 //! index of that offset is utilized as the answer to whether we're in the set
74 use std::collections::{BTreeMap, HashMap};
76 use ucd_parse::Codepoints;
84 use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
86 static PROPERTIES: &[&str] = &[
99 ranges: Vec<(&'static str, Vec<Range<u32>>)>,
100 to_upper: BTreeMap<u32, (u32, u32, u32)>,
101 to_lower: BTreeMap<u32, (u32, u32, u32)>,
104 fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
109 for codepoint in codepoints {
110 if origin == codepoint.value() {
115 a = Some(codepoint.value());
116 } else if b.is_none() {
117 b = Some(codepoint.value());
118 } else if c.is_none() {
119 c = Some(codepoint.value());
121 panic!("more than 3 mapped codepoints")
125 Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
128 static UNICODE_DIRECTORY: &str = "unicode-downloads";
130 fn load_data() -> UnicodeData {
131 unicode_download::fetch_latest();
133 let mut properties = HashMap::new();
134 for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
135 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
136 properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
139 for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
140 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
141 properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
145 let mut to_lower = BTreeMap::new();
146 let mut to_upper = BTreeMap::new();
147 for row in ucd_parse::UnicodeDataExpander::new(
148 ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
150 let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
153 row.general_category.as_str()
155 if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
158 .or_insert_with(Vec::new)
159 .push(Codepoints::Single(row.codepoint));
162 if let Some(mapped) = row.simple_lowercase_mapping {
163 if mapped != row.codepoint {
164 to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
167 if let Some(mapped) = row.simple_uppercase_mapping {
168 if mapped != row.codepoint {
169 to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
174 for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
175 if !row.conditions.is_empty() {
176 // Skip conditional case mappings
180 let key = row.codepoint.value();
181 if let Some(lower) = to_mapping(key, row.lowercase) {
182 to_lower.insert(key, lower);
184 if let Some(upper) = to_mapping(key, row.uppercase) {
185 to_upper.insert(key, upper);
189 let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
195 .flat_map(|codepoints| match codepoints {
196 Codepoints::Single(c) => c
198 .map(|ch| (ch as u32..ch as u32 + 1))
200 .collect::<Vec<_>>(),
201 Codepoints::Range(c) => c
203 .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
204 .collect::<Vec<_>>(),
206 .collect::<Vec<Range<u32>>>(),
211 for ranges in properties.values_mut() {
212 merge_ranges(ranges);
215 let mut properties = properties.into_iter().collect::<Vec<_>>();
216 properties.sort_by_key(|p| p.0);
217 UnicodeData { ranges: properties, to_lower, to_upper }
221 let write_location = std::env::args().nth(1).unwrap_or_else(|| {
222 eprintln!("Must provide path to write unicode tables to");
224 "e.g. {} library/core/unicode/unicode_data.rs",
225 std::env::args().next().unwrap_or_default()
227 std::process::exit(1);
230 // Optional test path, which is a Rust source file testing that the unicode
231 // property lookups are correct.
232 let test_path = std::env::args().nth(2);
234 let unicode_data = load_data();
235 let ranges_by_property = &unicode_data.ranges;
237 if let Some(path) = test_path {
238 std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
241 let mut total_bytes = 0;
242 let mut modules = Vec::new();
243 for (property, ranges) in ranges_by_property {
244 let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
246 let mut emitter = RawEmitter::new();
247 if property == &"White_Space" {
248 emit_whitespace(&mut emitter, &ranges);
250 emit_codepoints(&mut emitter, &ranges);
253 modules.push((property.to_lowercase().to_string(), emitter.file));
255 "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
260 ranges.first().unwrap().start,
261 ranges.last().unwrap().end,
264 total_bytes += emitter.bytes_used;
267 let mut table_file = String::new();
270 "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
273 // Include the range search function
274 table_file.push('\n');
275 table_file.push_str(include_str!("range_search.rs"));
276 table_file.push('\n');
278 table_file.push_str(&version());
280 table_file.push('\n');
282 modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
284 for (name, contents) in modules {
285 table_file.push_str("#[rustfmt::skip]\n");
286 table_file.push_str(&format!("pub mod {name} {{\n"));
287 for line in contents.lines() {
288 if !line.trim().is_empty() {
289 table_file.push_str(" ");
290 table_file.push_str(&line);
292 table_file.push('\n');
294 table_file.push_str("}\n\n");
297 std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
299 println!("Total table sizes: {total_bytes} bytes");
302 fn version() -> String {
303 let mut out = String::new();
304 out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");
307 std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
310 let prefix = "for Version ";
311 let start = readme.find(prefix).unwrap() + prefix.len();
312 let end = readme.find(" of the Unicode Standard.").unwrap();
314 readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
315 let [major, minor, micro] = [version[0], version[1], version[2]];
317 out.push_str(&format!("({major}, {minor}, {micro});\n"));
321 fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
322 let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
323 let mut out = String::new();
324 let mut line = String::from("\n ");
325 for piece in pieces {
326 if line.len() + piece.len() < 98 {
327 line.push_str(&piece);
329 out.push_str(line.trim_end());
331 line = format!(" {piece}");
334 out.push_str(line.trim_end());
339 fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
340 let mut s = String::new();
341 s.push_str("#![allow(incomplete_features, unused)]\n");
342 s.push_str("#![feature(const_generics)]\n\n");
343 s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
344 s.push_str(&format!("#[path = \"{data_path}\"]\n"));
345 s.push_str("mod unicode_data;\n\n");
347 s.push_str("\nfn main() {\n");
349 for (property, ranges) in ranges {
350 s.push_str(&format!(r#" println!("Testing {}");"#, property));
352 s.push_str(&format!(" {}_true();\n", property.to_lowercase()));
353 s.push_str(&format!(" {}_false();\n", property.to_lowercase()));
354 let mut is_true = Vec::new();
355 let mut is_false = Vec::new();
356 for ch_num in 0..(std::char::MAX as u32) {
357 if std::char::from_u32(ch_num).is_none() {
360 if ranges.iter().any(|r| r.contains(&ch_num)) {
361 is_true.push(ch_num);
363 is_false.push(ch_num);
367 s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase()));
368 generate_asserts(&mut s, property, &is_true, true);
369 s.push_str(" }\n\n");
370 s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase()));
371 generate_asserts(&mut s, property, &is_false, false);
372 s.push_str(" }\n\n");
379 fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
380 for range in ranges_from_set(points) {
381 if range.end == range.start + 1 {
383 " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
384 if truthy { "" } else { "!" },
385 property.to_lowercase(),
386 std::char::from_u32(range.start).unwrap(),
390 s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
392 " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
393 if truthy { "" } else { "!" },
394 property.to_lowercase(),
401 fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
402 let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
403 merge_ranges(&mut ranges);
407 fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
409 let mut new_ranges = Vec::new();
410 let mut idx_iter = 0..(ranges.len() - 1);
411 let mut should_insert_last = true;
412 while let Some(idx) = idx_iter.next() {
413 let cur = ranges[idx].clone();
414 let next = ranges[idx + 1].clone();
415 if cur.end == next.start {
416 if idx_iter.next().is_none() {
417 // We're merging the last element
418 should_insert_last = false;
420 new_ranges.push(cur.start..next.end);
422 // We're *not* merging the last element
423 should_insert_last = true;
424 new_ranges.push(cur);
427 if should_insert_last {
428 new_ranges.push(ranges.last().unwrap().clone());
430 if new_ranges.len() == ranges.len() {
431 *ranges = new_ranges;
434 *ranges = new_ranges;
438 let mut last_end = None;
439 for range in ranges {
440 if let Some(last) = last_end {
441 assert!(range.start > last, "{:?}", range);
443 last_end = Some(range.end);