src/tools/unicode-table-generator/src/raw_emitter.rs

   1 use crate::fmt_list;
   2 use std::collections::{BTreeMap, BTreeSet, HashMap};
   3 use std::convert::TryFrom;
   4 use std::fmt::{self, Write};
   5 use std::ops::Range;
   6
   7 #[derive(Clone)]
   8 pub struct RawEmitter {
   9     pub file: String,
  10     pub desc: String,
  11     pub bytes_used: usize,
  12 }
  13
  14 impl RawEmitter {
  15     pub fn new() -> RawEmitter {
  16         RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
  17     }
  18
  19     fn blank_line(&mut self) {
  20         if self.file.is_empty() || self.file.ends_with("\n\n") {
  21             return;
  22         }
  23         writeln!(&mut self.file).unwrap();
  24     }
  25
  26     fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
  27         let last_code_point = ranges.last().unwrap().end;
  28         // bitset for every bit in the codepoint range
  29         //
  30         // + 2 to ensure an all zero word to use for padding
  31         let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
  32         for range in ranges {
  33             for codepoint in range.clone() {
  34                 let bucket = codepoint as usize / 64;
  35                 let bit = codepoint as u64 % 64;
  36                 buckets[bucket] |= 1 << bit;
  37             }
  38         }
  39
  40         let mut words = buckets;
  41         // Ensure that there's a zero word in the dataset, used for padding and
  42         // such.
  43         words.push(0);
  44         let unique_words =
  45             words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
  46         if unique_words.len() > u8::MAX as usize {
  47             panic!("cannot pack {} into 8 bits", unique_words.len());
  48         }
  49         // needed for the chunk mapping to work
  50         assert_eq!(unique_words[0], 0, "has a zero word");
  51         let canonicalized = Canonicalized::canonicalize(&unique_words);
  52
  53         let word_indices = canonicalized.unique_mapping.clone();
  54         let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
  55
  56         let mut best = None;
  57         for length in 1..=64 {
  58             let mut temp = self.clone();
  59             temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
  60             if let Some((_, size)) = best {
  61                 if temp.bytes_used < size {
  62                     best = Some((length, temp.bytes_used));
  63                 }
  64             } else {
  65                 best = Some((length, temp.bytes_used));
  66             }
  67         }
  68         self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
  69
  70         struct Bits(u64);
  71         impl fmt::Debug for Bits {
  72             fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  73                 write!(f, "0b{:064b}", self.0)
  74             }
  75         }
  76
  77         writeln!(
  78             &mut self.file,
  79             "static BITSET_CANONICAL: [u64; {}] = [{}];",
  80             canonicalized.canonical_words.len(),
  81             fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
  82         )
  83         .unwrap();
  84         self.bytes_used += 8 * canonicalized.canonical_words.len();
  85         writeln!(
  86             &mut self.file,
  87             "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
  88             canonicalized.canonicalized_words.len(),
  89             fmt_list(&canonicalized.canonicalized_words),
  90         )
  91         .unwrap();
  92         // 8 bit index into shifted words, 7 bits for shift + optional flip
  93         // We only need it for the words that we removed by applying a shift and
  94         // flip to them.
  95         self.bytes_used += 2 * canonicalized.canonicalized_words.len();
  96
  97         self.blank_line();
  98
  99         writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
 100         writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
 101         writeln!(&mut self.file, "        c as u32,").unwrap();
 102         writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
 103         writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
 104         writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
 105         writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
 106         writeln!(&mut self.file, "    )").unwrap();
 107         writeln!(&mut self.file, "}}").unwrap();
 108     }
 109
 110     fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
 111         let mut compressed_words = compressed_words.to_vec();
 112         for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
 113             // pad out bitset index with zero words so we have all chunks of
 114             // chunkchunk_length
 115             compressed_words.push(zero_at);
 116         }
 117
 118         let mut chunks = BTreeSet::new();
 119         for chunk in compressed_words.chunks(chunk_length) {
 120             chunks.insert(chunk);
 121         }
 122         let chunk_map = chunks
 123             .clone()
 124             .into_iter()
 125             .enumerate()
 126             .map(|(idx, chunk)| (chunk, idx))
 127             .collect::<HashMap<_, _>>();
 128         let mut chunk_indices = Vec::new();
 129         for chunk in compressed_words.chunks(chunk_length) {
 130             chunk_indices.push(chunk_map[chunk]);
 131         }
 132
 133         writeln!(
 134             &mut self.file,
 135             "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
 136             chunk_indices.len(),
 137             fmt_list(&chunk_indices),
 138         )
 139         .unwrap();
 140         self.bytes_used += chunk_indices.len();
 141         writeln!(
 142             &mut self.file,
 143             "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
 144             chunk_length,
 145             chunks.len(),
 146             fmt_list(chunks.iter()),
 147         )
 148         .unwrap();
 149         self.bytes_used += chunk_length * chunks.len();
 150     }
 151 }
 152
 153 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
 154     emitter.blank_line();
 155
 156     let mut bitset = emitter.clone();
 157     bitset.emit_bitset(&ranges);
 158
 159     let mut skiplist = emitter.clone();
 160     skiplist.emit_skiplist(&ranges);
 161
 162     if bitset.bytes_used <= skiplist.bytes_used {
 163         *emitter = bitset;
 164         emitter.desc = String::from("bitset");
 165     } else {
 166         *emitter = skiplist;
 167         emitter.desc = String::from("skiplist");
 168     }
 169 }
 170
 171 struct Canonicalized {
 172     canonical_words: Vec<u64>,
 173     canonicalized_words: Vec<(u8, u8)>,
 174
 175     /// Maps an input unique word to the associated index (u8) which is into
 176     /// canonical_words or canonicalized_words (in order).
 177     unique_mapping: HashMap<u64, u8>,
 178 }
 179
 180 impl Canonicalized {
 181     fn canonicalize(unique_words: &[u64]) -> Self {
 182         #[derive(Copy, Clone, Debug)]
 183         enum Mapping {
 184             Rotate(u32),
 185             Invert,
 186             RotateAndInvert(u32),
 187             ShiftRight(u32),
 188         }
 189
 190         // key is the word being mapped to
 191         let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
 192         for &a in unique_words {
 193             'b: for &b in unique_words {
 194                 // skip self
 195                 if a == b {
 196                     continue;
 197                 }
 198
 199                 // All possible distinct rotations
 200                 for rotation in 1..64 {
 201                     if a.rotate_right(rotation) == b {
 202                         mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
 203                         // We're not interested in further mappings between a and b
 204                         continue 'b;
 205                     }
 206                 }
 207
 208                 if (!a) == b {
 209                     mappings.entry(b).or_default().push((a, Mapping::Invert));
 210                     // We're not interested in further mappings between a and b
 211                     continue 'b;
 212                 }
 213
 214                 // All possible distinct rotations, inverted
 215                 for rotation in 1..64 {
 216                     if (!a.rotate_right(rotation)) == b {
 217                         mappings
 218                             .entry(b)
 219                             .or_default()
 220                             .push((a, Mapping::RotateAndInvert(rotation)));
 221                         // We're not interested in further mappings between a and b
 222                         continue 'b;
 223                     }
 224                 }
 225
 226                 // All possible shifts
 227                 for shift_by in 1..64 {
 228                     if a == (b >> shift_by) {
 229                         mappings
 230                             .entry(b)
 231                             .or_default()
 232                             .push((a, Mapping::ShiftRight(shift_by as u32)));
 233                         // We're not interested in further mappings between a and b
 234                         continue 'b;
 235                     }
 236                 }
 237             }
 238         }
 239         // These are the bitset words which will be represented "raw" (as a u64)
 240         let mut canonical_words = Vec::new();
 241         // These are mapped words, which will be represented by an index into
 242         // the canonical_words and a Mapping; u16 when encoded.
 243         let mut canonicalized_words = Vec::new();
 244         let mut unique_mapping = HashMap::new();
 245
 246         #[derive(Debug, PartialEq, Eq)]
 247         enum UniqueMapping {
 248             Canonical(usize),
 249             Canonicalized(usize),
 250         }
 251
 252         // Map 0 first, so that it is the first canonical word.
 253         // This is realistically not inefficient because 0 is not mapped to by
 254         // anything else (a shift pattern could do it, but would be wasteful).
 255         //
 256         // However, 0s are quite common in the overall dataset, and it is quite
 257         // wasteful to have to go through a mapping function to determine that
 258         // we have a zero.
 259         //
 260         // FIXME: Experiment with choosing most common words in overall data set
 261         // for canonical when possible.
 262         while let Some((&to, _)) = mappings
 263             .iter()
 264             .find(|(&to, _)| to == 0)
 265             .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
 266         {
 267             // Get the mapping with the most entries. Currently, no mapping can
 268             // only exist transitively (i.e., there is no A, B, C such that A
 269             // does not map to C and but A maps to B maps to C), so this is
 270             // guaranteed to be acceptable.
 271             //
 272             // In the future, we may need a more sophisticated algorithm to
 273             // identify which keys to prefer as canonical.
 274             let mapped_from = mappings.remove(&to).unwrap();
 275             for (from, how) in &mapped_from {
 276                 // Remove the entries which mapped to this one.
 277                 // Noting that it should be associated with the Nth canonical word.
 278                 //
 279                 // We do not assert that this is present, because there may be
 280                 // no mappings to the `from` word; that's fine.
 281                 mappings.remove(from);
 282                 assert_eq!(
 283                     unique_mapping
 284                         .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
 285                     None
 286                 );
 287                 canonicalized_words.push((canonical_words.len(), *how));
 288
 289                 // Remove the now-canonicalized word from other mappings,
 290                 // to ensure that we deprioritize them in the next iteration of
 291                 // the while loop.
 292                 for mapped in mappings.values_mut() {
 293                     let mut i = 0;
 294                     while i != mapped.len() {
 295                         if mapped[i].0 == *from {
 296                             mapped.remove(i);
 297                         } else {
 298                             i += 1;
 299                         }
 300                     }
 301                 }
 302             }
 303             assert!(
 304                 unique_mapping
 305                     .insert(to, UniqueMapping::Canonical(canonical_words.len()))
 306                     .is_none()
 307             );
 308             canonical_words.push(to);
 309
 310             // Remove the now-canonical word from other mappings, to ensure that
 311             // we deprioritize them in the next iteration of the while loop.
 312             for mapped in mappings.values_mut() {
 313                 let mut i = 0;
 314                 while i != mapped.len() {
 315                     if mapped[i].0 == to {
 316                         mapped.remove(i);
 317                     } else {
 318                         i += 1;
 319                     }
 320                 }
 321             }
 322         }
 323
 324         // Any words which we couldn't shrink, just stick into the canonical
 325         // words.
 326         //
 327         // FIXME: work harder -- there are more possibilities for mapping
 328         // functions (e.g., multiplication, shifting instead of rotation, etc.)
 329         // We'll probably always have some slack though so this loop will still
 330         // be needed.
 331         for &w in unique_words {
 332             if !unique_mapping.contains_key(&w) {
 333                 assert!(
 334                     unique_mapping
 335                         .insert(w, UniqueMapping::Canonical(canonical_words.len()))
 336                         .is_none()
 337                 );
 338                 canonical_words.push(w);
 339             }
 340         }
 341         assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
 342         assert_eq!(unique_mapping.len(), unique_words.len());
 343
 344         let unique_mapping = unique_mapping
 345             .into_iter()
 346             .map(|(key, value)| {
 347                 (
 348                     key,
 349                     match value {
 350                         UniqueMapping::Canonicalized(idx) => {
 351                             u8::try_from(canonical_words.len() + idx).unwrap()
 352                         }
 353                         UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
 354                     },
 355                 )
 356             })
 357             .collect::<HashMap<_, _>>();
 358
 359         let mut distinct_indices = BTreeSet::new();
 360         for &w in unique_words {
 361             let idx = unique_mapping.get(&w).unwrap();
 362             assert!(distinct_indices.insert(idx));
 363         }
 364
 365         const LOWER_6: u32 = (1 << 6) - 1;
 366
 367         let canonicalized_words = canonicalized_words
 368             .into_iter()
 369             .map(|v| {
 370                 (
 371                     u8::try_from(v.0).unwrap(),
 372                     match v.1 {
 373                         Mapping::RotateAndInvert(amount) => {
 374                             assert_eq!(amount, amount & LOWER_6);
 375                             1 << 6 | (amount as u8)
 376                         }
 377                         Mapping::Rotate(amount) => {
 378                             assert_eq!(amount, amount & LOWER_6);
 379                             amount as u8
 380                         }
 381                         Mapping::Invert => 1 << 6,
 382                         Mapping::ShiftRight(shift_by) => {
 383                             assert_eq!(shift_by, shift_by & LOWER_6);
 384                             1 << 7 | (shift_by as u8)
 385                         }
 386                     },
 387                 )
 388             })
 389             .collect::<Vec<(u8, u8)>>();
 390         Canonicalized { unique_mapping, canonical_words, canonicalized_words }
 391     }
 392 }