1 //! This implements the core logic of the compression scheme used to compactly
2 //! encode the Unicode character classes.
4 //! The primary idea is that we 'flatten' the Unicode ranges into an enormous
5 //! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
6 //! over 17 kilobytes of data per character set -- way too much for our
9 //! We have two primary goals with the encoding: we want to be compact, because
10 //! these tables often end up in ~every Rust program (especially the
11 //! grapheme_extend table, used for str debugging), including those for embedded
12 //! targets (where space is important). We also want to be relatively fast,
13 //! though this is more of a nice to have rather than a key design constraint.
14 //! In practice, due to modern processor design these two are closely related.
16 //! The encoding scheme here compresses the bitset by first deduplicating the
17 //! "words" (64 bits on all platforms). In practice very few words are present
18 //! in most data sets.
20 //! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
21 //! words, we could go to u16 -> word or have some dual compression scheme
22 //! mapping into two separate sets; currently this is not dealt with).
24 //! With that scheme, we now have a single byte for every 64 codepoints. We
25 //! further group these by some constant N (between 1 and 64 per group), and
26 //! again deduplicate and store in an array (u8 -> [u8; N]). The constant is
27 //! chosen to be optimal in bytes-in-memory for the given dataset.
29 //! The indices into this array represent ranges of 64*16 = 1024 codepoints.
31 //! This already reduces the top-level array to at most 1,086 bytes, but in
32 //! practice we usually can encode in far fewer (the first couple Unicode planes
35 //! The last byte of this top-level array is pulled out to a separate static
36 //! and trailing zeros are dropped; this is simply because grapheme_extend and
37 //! case_ignorable have a single entry in the 896th entry, so this shrinks them
38 //! down considerably.
41 use std::collections::{BTreeMap, BTreeSet, HashMap};
42 use std::convert::TryFrom;
43 use std::fmt::{self, Write};
// Accumulates the generated Rust source for one character-class table,
// plus a running count of the bytes its emitted statics will occupy.
// NOTE(review): interior lines of this struct are elided in this dump
// (e.g. the `file: String` buffer field that `new()` constructs).
47 pub struct RawEmitter {
// Total runtime size in bytes of all statics emitted so far; updated by
// the `self.bytes_used += ...` statements in the emit methods.
49 pub bytes_used: usize,
// Creates an emitter with an empty output buffer and a zero byte count.
// NOTE(review): the enclosing `impl RawEmitter` header and this method's
// closing brace are on elided lines.
53 pub fn new() -> RawEmitter {
54 RawEmitter { file: String::new(), bytes_used: 0 }
// Appends a blank separator line to the output buffer, unless the buffer
// is empty or already ends with one (avoids stacked blank lines).
// NOTE(review): the early-return body between original lines 58 and 61
// is elided here — presumably a bare `return;`, confirm in full file.
57 fn blank_line(&mut self) {
58 if self.file.is_empty() || self.file.ends_with("\n\n") {
61 writeln!(&mut self.file, "").unwrap();
// Emits the compressed bitset statics for one dataset: deduplicates the
// 64-bit words, canonicalizes them (rotation/inversion derivations), then
// trial-emits every chunk length in 1..=64 and keeps the one that
// minimizes total bytes before emitting the word tables themselves.
// NOTE(review): this dump is missing interior lines throughout this
// method (the `unique_words` binding, the `best` initialization, the
// `writeln!` wrappers around lines 104-114, and several closing braces);
// the comments below describe only what the visible lines establish.
64 fn emit_bitset(&mut self, words: &[u64]) {
65 let mut words = words.to_vec();
66 // Ensure that there's a zero word in the dataset, used for padding and
// Deduplicate and sort; BTreeSet iterates ascending, so a present zero
// word lands at index 0 (asserted below).
70 words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
// Each 64-codepoint slot is addressed by a single u8, so more than 255
// distinct words cannot be represented by this scheme.
71 if unique_words.len() > u8::max_value() as usize {
72 panic!("cannot pack {} into 8 bits", unique_words.len());
74 // needed for the chunk mapping to work
75 assert_eq!(unique_words[0], 0, "has a zero word");
76 let canonicalized = Canonicalized::canonicalize(&unique_words);
// Map every original word to its u8 index in the flattened
// canonical+canonicalized index space.
78 let word_indices = canonicalized.unique_mapping.clone();
79 let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
// Trial-emit on a clone for every candidate chunk length, tracking the
// (length, bytes_used) pair with the smallest output.
82 for length in 1..=64 {
83 let mut temp = self.clone();
84 temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
85 if let Some((_, size)) = best {
86 if temp.bytes_used < size {
87 best = Some((length, temp.bytes_used));
90 best = Some((length, temp.bytes_used));
// Re-emit for real using the winning chunk length.
93 self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
// Debug impl for the `Bits` wrapper (its tuple-struct declaration is on
// an elided line): formats a word as a full 64-bit binary literal so the
// generated tables are readable and diffable.
96 impl fmt::Debug for Bits {
97 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98 write!(f, "0b{:064b}", self.0)
// Raw canonical words: 8 bytes each at runtime.
104 "static BITSET_CANONICAL: [u64; {}] = [{}];",
105 canonicalized.canonical_words.len(),
106 fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
109 self.bytes_used += 8 * canonicalized.canonical_words.len();
// Derived words as (canonical index, encoded mapping op) pairs: 2 bytes
// each at runtime.
112 "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
113 canonicalized.canonicalized_words.len(),
114 fmt_list(&canonicalized.canonicalized_words),
117 // 8 bit index into shifted words, 7 bits for shift + optional flip
118 // We only need it for the words that we removed by applying a shift and
120 self.bytes_used += 2 * canonicalized.canonicalized_words.len();
// Emits the two-level chunk map for a given chunk length: pads the
// per-64-codepoint index list to whole chunks, deduplicates the chunks,
// then writes BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP and
// BITSET_INDEX_CHUNKS, accumulating each table's byte cost.
// NOTE(review): interior lines are elided in this dump (the bodies of
// the trailing-zero pop loops, the `writeln!` wrappers, closing braces).
123 fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
124 let mut compressed_words = compressed_words.to_vec();
// NOTE(review): when len is already a multiple of chunk_length, the
// range below is 0..chunk_length, so a full extra chunk of zero words
// gets appended rather than none — presumably harmless padding, but
// worth confirming it is intended.
125 for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
126 // pad out bitset index with zero words so we have all chunks of
128 compressed_words.push(zero_at);
// Deduplicate chunks; BTreeSet gives them a stable sorted order.
131 let mut chunks = BTreeSet::new();
132 for chunk in compressed_words.chunks(chunk_length) {
133 chunks.insert(chunk);
// Assign each distinct chunk an index (its position in sorted order).
135 let chunk_map = chunks
139 .map(|(idx, chunk)| (chunk, idx))
140 .collect::<HashMap<_, _>>();
// Top-level map: one chunk index per chunk of the compressed words.
141 let mut chunk_indices = Vec::new();
142 for chunk in compressed_words.chunks(chunk_length) {
143 chunk_indices.push(chunk_map[chunk]);
146 // If one of the chunks has all of the entries point to the bitset
147 // word filled with zeros, then pop those off the end -- we know they
// Locate the all-zero chunk (if any) so trailing references to it can
// be dropped from the top-level map.
149 let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
150 while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
// Pull the final entry out into its own (position, chunk index) static
// so any zero entries preceding it can also be dropped.
155 "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
156 chunk_indices.len() - 1,
157 chunk_indices.pop().unwrap(),
// u16 position + u8 index = 3 bytes at runtime.
160 self.bytes_used += 3;
161 // Try to pop again, now that we've recorded a non-zero pointing index
162 // into the LAST_CHUNK_MAP.
163 while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
// One u8 per remaining chunk of 64*chunk_length codepoints.
168 "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
170 fmt_list(&chunk_indices),
173 self.bytes_used += chunk_indices.len();
// The deduplicated chunks themselves: rows of chunk_length word indices.
176 "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
179 fmt_list(chunks.iter()),
182 self.bytes_used += chunk_length * chunks.len();
// Emits the public `lookup(char) -> bool` entry point of the generated
// module, which forwards to a shared `super::range_search` over the
// statics emitted above.
// NOTE(review): this method's own closing brace (original line 196) is
// on an elided line.
185 pub fn emit_lookup(&mut self) {
186 writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
187 writeln!(&mut self.file, " super::range_search(",).unwrap();
188 writeln!(&mut self.file, " c as u32,").unwrap();
189 writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
190 writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
191 writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
192 writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
193 writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
194 writeln!(&mut self.file, " )").unwrap();
195 writeln!(&mut self.file, "}}").unwrap();
// Entry point for one dataset: rasterizes the codepoint ranges into a
// flat bitset (one bit per codepoint, 64 per u64 bucket), then emits the
// compressed tables and the `lookup` function.
// Panics if `ranges` is empty (the `unwrap` below).
// NOTE(review): `Range` is presumably `std::ops::Range` imported on an
// elided line — confirm against the full file. Also assumes `ranges` is
// sorted so the last range's `end` is the maximum codepoint; verify at
// the caller.
199 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
200 emitter.blank_line();
202 let last_code_point = ranges.last().unwrap().end;
203 // bitset for every bit in the codepoint range
205 // + 2 to ensure an all zero word to use for padding
206 let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
207 for range in ranges {
208 for codepoint in range.clone() {
// Set bit (codepoint % 64) in bucket (codepoint / 64).
209 let bucket = codepoint as usize / 64;
210 let bit = codepoint as u64 % 64;
211 buckets[bucket] |= 1 << bit;
215 emitter.emit_bitset(&buckets);
216 emitter.blank_line();
217 emitter.emit_lookup();
// Result of word canonicalization: words stored verbatim, plus words
// derivable from them via rotation and/or inversion.
220 struct Canonicalized {
// Words emitted raw as u64s (BITSET_CANONICAL).
221 canonical_words: Vec<u64>,
// Derived words, each encoded as (canonical index, mapping-op byte)
// (BITSET_MAPPING).
222 canonicalized_words: Vec<(u8, u8)>,
224 /// Maps an input unique word to the associated index (u8) which is into
225 /// canonical_words or canonicalized_words (in order).
226 unique_mapping: HashMap<u64, u8>,
// Greedy canonicalization: for every ordered pair of unique words,
// record whether one is derivable from the other by rotation, inversion,
// or both; then repeatedly pick the word with the most derivations as
// canonical, encoding the derived words as (index, op) pairs; leftovers
// become canonical as-is.
// NOTE(review): many interior lines are elided in this dump (other enum
// variants, the plain-inversion check, the bodies of the `mapped`
// splice loops, closing braces) — comments describe only visible lines.
// The enclosing `impl Canonicalized` header is also elided.
230 fn canonicalize(unique_words: &[u64]) -> Self {
231 #[derive(Copy, Clone, Debug)]
// One variant of a local `Mapping` enum; its `Rotate` and `Invert`
// siblings are referenced below but declared on elided lines.
235 RotateAndInvert(u32),
238 // key is the word being mapped to
239 let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
240 for &a in unique_words {
241 'b: for &b in unique_words {
247 // All possible distinct rotations
248 for rotation in 1..64 {
249 if a.rotate_right(rotation) == b {
250 mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
251 // We're not interested in further mappings between a and b
257 mappings.entry(b).or_default().push((a, Mapping::Invert));
258 // We're not interested in further mappings between a and b
262 // All possible distinct rotations, inverted
263 for rotation in 1..64 {
264 if (!a.rotate_right(rotation)) == b {
268 .push((a, Mapping::RotateAndInvert(rotation)));
269 // We're not interested in further mappings between a and b
275 // These are the bitset words which will be represented "raw" (as a u64)
276 let mut canonical_words = Vec::new();
277 // These are mapped words, which will be represented by an index into
278 // the canonical_words and a Mapping; u16 when encoded.
279 let mut canonicalized_words = Vec::new();
280 let mut unique_mapping = HashMap::new();
282 #[derive(Debug, PartialEq, Eq)]
// Variant of a local `UniqueMapping` enum; its `Canonical` sibling is
// referenced below (the enum header is on an elided line).
285 Canonicalized(usize),
// Greedily take the target word with the most derivable words.
288 while let Some((&to, _)) = mappings.iter().max_by_key(|m| m.1.len()) {
289 // Get the mapping with the most entries. Currently, no mapping can
290 // only exist transitively (i.e., there is no A, B, C such that A
291 // does not map to C, but A maps to B maps to C), so this is
292 // guaranteed to be acceptable.
294 // In the future, we may need a more sophisticated algorithm to
295 // identify which keys to prefer as canonical.
296 let mapped_from = mappings.remove(&to).unwrap();
297 for (from, how) in &mapped_from {
298 // Remove the entries which mapped to this one.
299 // Noting that it should be associated with the Nth canonical word.
301 // We do not assert that this is present, because there may be
302 // no mappings to the `from` word; that's fine.
303 mappings.remove(from);
306 .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
// `canonical_words.len()` is the index `to` will get when pushed at
// original line 330 below.
309 canonicalized_words.push((canonical_words.len(), *how));
311 // Remove the now-canonicalized word from other mappings,
312 // to ensure that we deprioritize them in the next iteration of
314 for (_, mapped) in &mut mappings {
316 while i != mapped.len() {
317 if mapped[i].0 == *from {
327 .insert(to, UniqueMapping::Canonical(canonical_words.len()))
330 canonical_words.push(to);
332 // Remove the now-canonical word from other mappings, to ensure that
333 // we deprioritize them in the next iteration of the while loop.
334 for (_, mapped) in &mut mappings {
336 while i != mapped.len() {
337 if mapped[i].0 == to {
346 // Any words which we couldn't shrink, just stick into the canonical
349 // FIXME: work harder -- there are more possibilities for mapping
350 // functions (e.g., multiplication, shifting instead of rotation, etc.)
351 // We'll probably always have some slack though so this loop will still
353 for &w in unique_words {
354 if !unique_mapping.contains_key(&w) {
357 .insert(w, UniqueMapping::Canonical(canonical_words.len()))
360 canonical_words.push(w);
// Every unique word must be accounted for exactly once, in exactly one
// of the two output vectors.
363 assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
364 assert_eq!(unique_mapping.len(), unique_words.len());
// Flatten UniqueMapping into a single u8 index space: canonical words
// first, then canonicalized words offset by canonical_words.len().
366 let unique_mapping = unique_mapping
368 .map(|(key, value)| {
372 UniqueMapping::Canonicalized(idx) => {
373 u8::try_from(canonical_words.len() + idx).unwrap()
375 UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
379 .collect::<HashMap<_, _>>();
// Sanity check: every unique word maps to a distinct index.
381 let mut distinct_indices = BTreeSet::new();
382 for &w in unique_words {
383 let idx = unique_mapping.get(&w).unwrap();
384 assert!(distinct_indices.insert(idx));
// Encode each mapping op into one byte: high bit = invert, low 7 bits
// = rotation amount (matching the "7 bits for shift + optional flip"
// note in the emitter).
387 let canonicalized_words = canonicalized_words
391 u8::try_from(v.0).unwrap(),
393 Mapping::RotateAndInvert(amount) => {
394 assert!(amount < (1 << 7));
395 1 << 7 | (amount as u8)
397 Mapping::Rotate(amount) => {
398 assert!(amount < (1 << 7));
401 Mapping::Invert => 1 << 7,
405 .collect::<Vec<(u8, u8)>>();
406 Canonicalized { unique_mapping, canonical_words, canonicalized_words }