//! This implements the core logic of the compression scheme used to compactly
//! encode the Unicode character classes.
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! over 17 kilobytes of data per character set -- way too much for our
//! purposes.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice-to-have than a key design constraint. In
//! practice, due to modern processor design, these two goals are closely
//! related.
//!
//! The encoding scheme here compresses the bitset by first deduplicating its
//! "words" (`u64`s, 64 bits on all platforms). In practice, very few distinct
//! words occur in most data sets.
//!
//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
//! words, we could move to `u16 -> word`, or use some dual compression scheme
//! mapping into two separate sets; currently this case is not handled).
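//!
//! For instance (an illustrative sketch with made-up words, not real data),
//! a flattened bitset of six words deduplicates like so:
//!
//! ```text
//! words:  [0, 0, A, 0, B, A]   // one u64 per 64 codepoints
//! unique: [0, A, B]            // the deduplicated word table
//! bytes:  [0, 0, 1, 0, 2, 1]   // per-64-codepoint indices into `unique`
//! ```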
//!
//! With that scheme, we now have a single byte for every 64 codepoints. We
//! further group these bytes into chunks of some constant length N (between 1
//! and 64 per group), and again deduplicate, storing the chunks in an array
//! (`u8 -> [u8; N]`). The constant is chosen to be optimal in bytes-in-memory
//! for the given dataset.
//!
//! With N = 16, for example, each index into this top-level array represents
//! a range of 64*16 = 1024 codepoints.
//!
//! This already reduces the top-level array to at most 1,086 bytes, but in
//! practice we usually can encode in far fewer (the first couple Unicode
//! planes are relatively dense).
//!
//! The last byte of this top-level array is pulled out into a separate static,
//! and trailing zeros are dropped; this is simply because grapheme_extend and
//! case_ignorable have a lone nonzero byte in the 896th entry, so this shrinks
//! them down considerably.
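//!
//! A lookup for a codepoint `c` then walks these layers from the top down.
//! The following is a minimal sketch (assuming a chunk length of N = 16;
//! `decode_word` is a hypothetical stand-in for resolving a word index via
//! `BITSET_CANONICAL` and `BITSET_MAPPING`, described below):
//!
//! ```text
//! let bucket = c / 64;                  // which u64 word holds c's bit
//! let chunk = BITSET_CHUNKS_MAP[bucket / 16];
//! let word_idx = BITSET_INDEX_CHUNKS[chunk][bucket % 16];
//! let word = decode_word(word_idx);     // canonical word, possibly remapped
//! word & (1 << (c % 64)) != 0
//! ```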

use crate::fmt_list;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::convert::TryFrom;
use std::fmt::{self, Write};
use std::ops::Range;

#[derive(Clone)]
pub struct RawEmitter {
    pub file: String,
    pub desc: String,
    pub bytes_used: usize,
}

impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
        if self.file.is_empty() || self.file.ends_with("\n\n") {
            return;
        }
        writeln!(&mut self.file).unwrap();
    }

    fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
        let last_code_point = ranges.last().unwrap().end;
        // Bitset with a bit for every codepoint up to the last one we need;
        // + 2 words to guarantee that an all-zero word exists for padding.
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);

        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::max_value() as usize {
            panic!("cannot pack {} into 8 bits", unique_words.len());
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");
        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();

        // Try every chunk length and keep whichever yields the smallest total
        // encoding for this dataset.
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }
118 "static BITSET_CANONICAL: [u64; {}] = [{}];",
119 canonicalized.canonical_words.len(),
120 fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
123 self.bytes_used += 8 * canonicalized.canonical_words.len();
126 "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
127 canonicalized.canonicalized_words.len(),
128 fmt_list(&canonicalized.canonicalized_words),
131 // 8 bit index into shifted words, 7 bits for shift + optional flip
132 // We only need it for the words that we removed by applying a shift and
134 self.bytes_used += 2 * canonicalized.canonicalized_words.len();
138 writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
139 writeln!(&mut self.file, " super::bitset_search(",).unwrap();
140 writeln!(&mut self.file, " c as u32,").unwrap();
141 writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
142 writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
143 writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
144 writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
145 writeln!(&mut self.file, " )").unwrap();
146 writeln!(&mut self.file, "}}").unwrap();

    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
            // Pad out the bitset index with zero words so that all chunks
            // have the same length.
            compressed_words.push(zero_at);
        }

        let mut chunks = BTreeSet::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }
        let chunk_map = chunks
            .clone()
            .into_iter()
            .enumerate()
            .map(|(idx, chunk)| (chunk, idx))
            .collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }
174 "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
176 fmt_list(&chunk_indices),
179 self.bytes_used += chunk_indices.len();
182 "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
185 fmt_list(chunks.iter()),
188 self.bytes_used += chunk_length * chunks.len();

pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut bitset = emitter.clone();
    bitset.emit_bitset(&ranges);

    let mut skiplist = emitter.clone();
    skiplist.emit_skiplist(&ranges);

    // Keep whichever encoding came out smaller for this dataset.
    if bitset.bytes_used <= skiplist.bytes_used {
        *emitter = bitset;
        emitter.desc = String::from("bitset");
    } else {
        *emitter = skiplist;
        emitter.desc = String::from("skiplist");
    }
}

struct Canonicalized {
    /// Words which are emitted raw, as `u64`s.
    canonical_words: Vec<u64>,
    /// Words expressed as a transformation of a canonical word: an index into
    /// `canonical_words` plus a packed operation byte.
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to the associated index (u8), which points
    /// into canonical_words or canonicalized_words (in order).
    unique_mapping: HashMap<u64, u8>,
}

impl Canonicalized {
    fn canonicalize(unique_words: &[u64]) -> Self {
        // The transformations by which one word can be derived from another.
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // The key is the word being mapped *to*; the values are the words
        // that map to it, along with how.
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
231 for &a in unique_words {
232 'b: for &b in unique_words {
238 // All possible distinct rotations
239 for rotation in 1..64 {
240 if a.rotate_right(rotation) == b {
241 mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
242 // We're not interested in further mappings between a and b
248 mappings.entry(b).or_default().push((a, Mapping::Invert));
249 // We're not interested in further mappings between a and b
253 // All possible distinct rotations, inverted
254 for rotation in 1..64 {
255 if (!a.rotate_right(rotation)) == b {
259 .push((a, Mapping::RotateAndInvert(rotation)));
260 // We're not interested in further mappings between a and b
265 // All possible shifts
266 for shift_by in 1..64 {
267 if a == (b >> shift_by) {
271 .push((a, Mapping::ShiftRight(shift_by as u32)));
272 // We're not interested in further mappings between a and b
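
        // The loops above record, for example (illustrative values, not from
        // a real dataset): `1u64.rotate_right(1) == 1 << 63`, so if both words
        // occur in `unique_words`, the entry keyed by `1 << 63` gains
        // `(1, Mapping::Rotate(1))`.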

        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are the mapped words, which will be represented by an index into
        // the canonical words plus a Mapping; a u16 when encoded.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but that would be
        // wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero.
        //
        // FIXME: Experiment with choosing the most common words in the overall
        // data set as canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Take the mapping with the most entries. Currently, no mapping
            // can exist only transitively (i.e., there is no A, B, C such that
            // A does not map to C but A maps to B and B maps to C), so this is
            // guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();
            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one, noting that
                // each should be associated with the Nth canonical word.
                //
                // We do not assert that a removal took place, because there
                // may be no mappings to the `from` word; that's fine.
                mappings.remove(from);
                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for (_, mapped) in &mut mappings {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }

            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for (_, mapped) in &mut mappings {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink just get stuck into the canonical
        // words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.).
        // We'll probably always have some slack, though, so this loop will
        // still be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }
        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());

        // Convert to the final index space: canonical words come first,
        // followed by the canonicalized (mapped) words.
        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (
                    key,
                    match value {
                        UniqueMapping::Canonicalized(idx) => {
                            u8::try_from(canonical_words.len() + idx).unwrap()
                        }
                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                    },
                )
            })
            .collect::<HashMap<_, _>>();

        // Sanity check: every unique word must map to a distinct index.
        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }

        const LOWER_6: u32 = (1 << 6) - 1;

        // Pack each mapping into a byte: bit 7 set means shift-right, bit 6
        // set means invert, and the low 6 bits hold the rotate/shift amount.
        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (
                    u8::try_from(v.0).unwrap(),
                    match v.1 {
                        Mapping::RotateAndInvert(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            1 << 6 | (amount as u8)
                        }
                        Mapping::Rotate(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            amount as u8
                        }
                        Mapping::Invert => 1 << 6,
                        Mapping::ShiftRight(shift_by) => {
                            assert_eq!(shift_by, shift_by & LOWER_6);
                            1 << 7 | (shift_by as u8)
                        }
                    },
                )
            })
            .collect::<Vec<(u8, u8)>>();

        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}
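
// For illustration only: a minimal sketch (not part of the real generator)
// showing how a consumer can undo the packed mapping bytes produced above.
// The actual runtime decoding lives in `bitset_search` on the library side;
// this test merely mirrors the encoding performed in `canonicalize`.
#[cfg(test)]
mod mapping_tests {
    // Apply a packed operation byte to a canonical word, recovering the
    // original word it was derived from.
    fn decode(canonical: u64, op: u8) -> u64 {
        let amount = u32::from(op & ((1 << 6) - 1));
        let mut word = canonical;
        if op & (1 << 6) != 0 {
            // Invert / RotateAndInvert: undo the inversion first.
            word = !word;
        }
        if op & (1 << 7) != 0 {
            // ShiftRight(amount): original == canonical >> amount.
            word >> amount
        } else {
            // Rotate(amount): original.rotate_right(amount) == canonical,
            // so rotate left to undo. A bare Invert has amount == 0.
            word.rotate_left(amount)
        }
    }

    #[test]
    fn round_trips() {
        assert_eq!(decode(1 << 63, 1), 1); // Rotate(1)
        assert_eq!(decode(!0xFF_u64, 1 << 6), 0xFF); // Invert
        assert_eq!(decode(0xF0, 1 << 7 | 4), 0x0F); // ShiftRight(4)
    }
}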