2 use std::collections::{BTreeMap, BTreeSet, HashMap};
3 use std::convert::TryFrom;
4 use std::fmt::{self, Write};
8 pub struct RawEmitter {
11 pub bytes_used: usize,
15 pub fn new() -> RawEmitter {
16 RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
19 fn blank_line(&mut self) {
20 if self.file.is_empty() || self.file.ends_with("\n\n") {
23 writeln!(&mut self.file, "").unwrap();
26 fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
27 let last_code_point = ranges.last().unwrap().end;
28 // bitset for every bit in the codepoint range
30 // + 2 to ensure an all zero word to use for padding
31 let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
33 for codepoint in range.clone() {
34 let bucket = codepoint as usize / 64;
35 let bit = codepoint as u64 % 64;
36 buckets[bucket] |= 1 << bit;
40 let mut words = buckets;
41 // Ensure that there's a zero word in the dataset, used for padding and
45 words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
46 if unique_words.len() > u8::MAX as usize {
47 panic!("cannot pack {} into 8 bits", unique_words.len());
49 // needed for the chunk mapping to work
50 assert_eq!(unique_words[0], 0, "has a zero word");
51 let canonicalized = Canonicalized::canonicalize(&unique_words);
53 let word_indices = canonicalized.unique_mapping.clone();
54 let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
57 for length in 1..=64 {
58 let mut temp = self.clone();
59 temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
60 if let Some((_, size)) = best {
61 if temp.bytes_used < size {
62 best = Some((length, temp.bytes_used));
65 best = Some((length, temp.bytes_used));
68 self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
71 impl fmt::Debug for Bits {
72 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
73 write!(f, "0b{:064b}", self.0)
79 "static BITSET_CANONICAL: [u64; {}] = [{}];",
80 canonicalized.canonical_words.len(),
81 fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
84 self.bytes_used += 8 * canonicalized.canonical_words.len();
87 "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
88 canonicalized.canonicalized_words.len(),
89 fmt_list(&canonicalized.canonicalized_words),
92 // 8 bit index into shifted words, 7 bits for shift + optional flip
93 // We only need it for the words that we removed by applying a shift and
95 self.bytes_used += 2 * canonicalized.canonicalized_words.len();
99 writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
100 writeln!(&mut self.file, " super::bitset_search(",).unwrap();
101 writeln!(&mut self.file, " c as u32,").unwrap();
102 writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
103 writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
104 writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
105 writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
106 writeln!(&mut self.file, " )").unwrap();
107 writeln!(&mut self.file, "}}").unwrap();
110 fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
111 let mut compressed_words = compressed_words.to_vec();
112 for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
113 // pad out bitset index with zero words so we have all chunks of
115 compressed_words.push(zero_at);
118 let mut chunks = BTreeSet::new();
119 for chunk in compressed_words.chunks(chunk_length) {
120 chunks.insert(chunk);
122 let chunk_map = chunks
126 .map(|(idx, chunk)| (chunk, idx))
127 .collect::<HashMap<_, _>>();
128 let mut chunk_indices = Vec::new();
129 for chunk in compressed_words.chunks(chunk_length) {
130 chunk_indices.push(chunk_map[chunk]);
135 "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
137 fmt_list(&chunk_indices),
140 self.bytes_used += chunk_indices.len();
143 "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
146 fmt_list(chunks.iter()),
149 self.bytes_used += chunk_length * chunks.len();
153 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
154 emitter.blank_line();
156 let mut bitset = emitter.clone();
157 bitset.emit_bitset(&ranges);
159 let mut skiplist = emitter.clone();
160 skiplist.emit_skiplist(&ranges);
162 if bitset.bytes_used <= skiplist.bytes_used {
164 emitter.desc = format!("bitset");
167 emitter.desc = format!("skiplist");
171 struct Canonicalized {
172 canonical_words: Vec<u64>,
173 canonicalized_words: Vec<(u8, u8)>,
175 /// Maps an input unique word to the associated index (u8) which is into
176 /// canonical_words or canonicalized_words (in order).
177 unique_mapping: HashMap<u64, u8>,
181 fn canonicalize(unique_words: &[u64]) -> Self {
182 #[derive(Copy, Clone, Debug)]
186 RotateAndInvert(u32),
190 // key is the word being mapped to
191 let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
192 for &a in unique_words {
193 'b: for &b in unique_words {
199 // All possible distinct rotations
200 for rotation in 1..64 {
201 if a.rotate_right(rotation) == b {
202 mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
203 // We're not interested in further mappings between a and b
209 mappings.entry(b).or_default().push((a, Mapping::Invert));
210 // We're not interested in further mappings between a and b
214 // All possible distinct rotations, inverted
215 for rotation in 1..64 {
216 if (!a.rotate_right(rotation)) == b {
220 .push((a, Mapping::RotateAndInvert(rotation)));
221 // We're not interested in further mappings between a and b
226 // All possible shifts
227 for shift_by in 1..64 {
228 if a == (b >> shift_by) {
232 .push((a, Mapping::ShiftRight(shift_by as u32)));
233 // We're not interested in further mappings between a and b
239 // These are the bitset words which will be represented "raw" (as a u64)
240 let mut canonical_words = Vec::new();
241 // These are mapped words, which will be represented by an index into
242 // the canonical_words and a Mapping; u16 when encoded.
243 let mut canonicalized_words = Vec::new();
244 let mut unique_mapping = HashMap::new();
246 #[derive(Debug, PartialEq, Eq)]
249 Canonicalized(usize),
252 // Map 0 first, so that it is the first canonical word.
253 // This is realistically not inefficient because 0 is not mapped to by
254 // anything else (a shift pattern could do it, but would be wasteful).
256 // However, 0s are quite common in the overall dataset, and it is quite
257 // wasteful to have to go through a mapping function to determine that
260 // FIXME: Experiment with choosing most common words in overall data set
261 // for canonical when possible.
262 while let Some((&to, _)) = mappings
264 .find(|(&to, _)| to == 0)
265 .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
267 // Get the mapping with the most entries. Currently, no mapping can
268 // only exist transitively (i.e., there is no A, B, C such that A
269 // does not map to C and but A maps to B maps to C), so this is
270 // guaranteed to be acceptable.
272 // In the future, we may need a more sophisticated algorithm to
273 // identify which keys to prefer as canonical.
274 let mapped_from = mappings.remove(&to).unwrap();
275 for (from, how) in &mapped_from {
276 // Remove the entries which mapped to this one.
277 // Noting that it should be associated with the Nth canonical word.
279 // We do not assert that this is present, because there may be
280 // no mappings to the `from` word; that's fine.
281 mappings.remove(from);
284 .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
287 canonicalized_words.push((canonical_words.len(), *how));
289 // Remove the now-canonicalized word from other mappings,
290 // to ensure that we deprioritize them in the next iteration of
292 for (_, mapped) in &mut mappings {
294 while i != mapped.len() {
295 if mapped[i].0 == *from {
305 .insert(to, UniqueMapping::Canonical(canonical_words.len()))
308 canonical_words.push(to);
310 // Remove the now-canonical word from other mappings, to ensure that
311 // we deprioritize them in the next iteration of the while loop.
312 for (_, mapped) in &mut mappings {
314 while i != mapped.len() {
315 if mapped[i].0 == to {
324 // Any words which we couldn't shrink, just stick into the canonical
327 // FIXME: work harder -- there are more possibilities for mapping
328 // functions (e.g., multiplication, shifting instead of rotation, etc.)
329 // We'll probably always have some slack though so this loop will still
331 for &w in unique_words {
332 if !unique_mapping.contains_key(&w) {
335 .insert(w, UniqueMapping::Canonical(canonical_words.len()))
338 canonical_words.push(w);
341 assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
342 assert_eq!(unique_mapping.len(), unique_words.len());
344 let unique_mapping = unique_mapping
346 .map(|(key, value)| {
350 UniqueMapping::Canonicalized(idx) => {
351 u8::try_from(canonical_words.len() + idx).unwrap()
353 UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
357 .collect::<HashMap<_, _>>();
359 let mut distinct_indices = BTreeSet::new();
360 for &w in unique_words {
361 let idx = unique_mapping.get(&w).unwrap();
362 assert!(distinct_indices.insert(idx));
365 const LOWER_6: u32 = (1 << 6) - 1;
367 let canonicalized_words = canonicalized_words
371 u8::try_from(v.0).unwrap(),
373 Mapping::RotateAndInvert(amount) => {
374 assert_eq!(amount, amount & LOWER_6);
375 1 << 6 | (amount as u8)
377 Mapping::Rotate(amount) => {
378 assert_eq!(amount, amount & LOWER_6);
381 Mapping::Invert => 1 << 6,
382 Mapping::ShiftRight(shift_by) => {
383 assert_eq!(shift_by, shift_by & LOWER_6);
384 1 << 7 | (shift_by as u8)
389 .collect::<Vec<(u8, u8)>>();
390 Canonicalized { unique_mapping, canonical_words, canonicalized_words }