From b6bc9060041bb5de18d9b31fe935d29193d9bad5 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Fri, 27 Mar 2020 18:01:14 -0400 Subject: [PATCH] Remove separate encoding for a single nonzero-mapping byte In practice, for the two data sets that still use the bitset encoding (uppercase and lowercase) this is not a significant win, so just drop it entirely. It costs us about 5 bytes, and the complexity is nontrivial. --- src/libcore/unicode/unicode_data.rs | 22 ++++++----------- .../src/range_search.rs | 9 ++----- .../src/raw_emitter.rs | 24 ------------------- 3 files changed, 9 insertions(+), 46 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 72ea8ce0381..48caa21fb0a 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -10,7 +10,6 @@ fn bitset_search< >( needle: u32, chunk_idx_map: &[u8; N], - last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -18,12 +17,8 @@ fn bitset_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, - // so we need to remap it - let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { - chunk_idx_map[chunk_map_idx] - } else if chunk_map_idx == last_chunk_idx as usize { - chunk_idx_map[chunk_idx_map.len() - 1] + let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { + v } else { return false; }; @@ -317,12 +312,12 @@ pub fn lookup(c: char) -> bool { #[rustfmt::skip] pub mod lowercase { - const BITSET_LAST_CHUNK_MAP: u16 = 122; - static BITSET_CHUNKS_MAP: [u8; 119] = [ + static BITSET_CHUNKS_MAP: [u8; 123] = [ 13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6, + 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, + 0, 0, 6, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], @@ -408,7 +403,6 @@ pub fn lookup(c: char) -> bool { super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, &BITSET_CANONICAL, &BITSET_MAPPING, @@ -449,13 +443,12 @@ pub fn lookup(c: char) -> bool { #[rustfmt::skip] pub mod uppercase { - const BITSET_LAST_CHUNK_MAP: u16 = 124; - static BITSET_CHUNKS_MAP: [u8; 124] = [ + static BITSET_CHUNKS_MAP: [u8; 125] = [ 12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5, - 5, 5, 9, 3, + 5, 5, 9, 5, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ [41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0], @@ -529,7 +522,6 @@ pub fn lookup(c: char) -> bool { super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, &BITSET_CANONICAL, &BITSET_MAPPING, diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index 49e65521c98..39b47ce703f 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -8,7 +8,6 @@ fn bitset_search< >( needle: u32, chunk_idx_map: &[u8; N], - last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -16,12 +15,8 @@ fn bitset_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, - // so we need to remap it - let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { - chunk_idx_map[chunk_map_idx] - } else if chunk_map_idx == last_chunk_idx as usize { - chunk_idx_map[chunk_idx_map.len() - 1] + let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { + v } else { return false; }; diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index db9d04b3fa9..dd0746cf695 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -139,7 +139,6 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(&mut self.file, " super::bitset_search(",).unwrap(); writeln!(&mut self.file, " c as u32,").unwrap(); writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); - writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); @@ -170,29 +169,6 @@ fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: chunk_indices.push(chunk_map[chunk]); } - // If one of the chunks has all of the entries point to the bitset - // word filled with zeros, then pop those off the end -- we know they - // are useless. - let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at)); - while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { - chunk_indices.pop(); - } - // We do not count the LAST_CHUNK_MAP as adding bytes because it's a - // small constant whose values are inlined directly into the instruction - // stream. - writeln!( - &mut self.file, - "const BITSET_LAST_CHUNK_MAP: u16 = {};", - chunk_indices.len() - 1, - ) - .unwrap(); - let nonzero = chunk_indices.pop().unwrap(); - // Try to pop again, now that we've recorded a non-zero pointing index - // into the LAST_CHUNK_MAP. - while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { - chunk_indices.pop(); - } - chunk_indices.push(nonzero); writeln!( &mut self.file, "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", -- 2.44.0