--- /dev/null
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
+/// implemented with postfix compression (sharing of identical child nodes),
+/// which gives both compact size and fast lookup.
+///
+/// The space of Unicode codepoints is divided into 3 subareas, each
+/// represented by a trie with different depth. In the first (0..0x800), there
+/// is no trie structure at all; each u64 entry corresponds to a bitvector
+/// effectively holding 64 bool values.
+///
+/// In the second (0x800..0x10000), each child of the root node represents a
+/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
+/// the trie stores an 8-bit index into a shared table of leaf values. This
+/// exploits the fact that in reasonable sets, many such leaves can be shared.
+///
+/// In the third (0x10000..0x110000), each child of the root node represents a
+/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
+/// of a child tree. Each of these 64 bytes represents an index into the table
+/// of shared 64-bit leaf values. This exploits the sparse structure in the
+/// non-BMP range of most Unicode sets.
+pub struct BoolTrie {
+ // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
+ pub r1: [u64; 32], // leaves: one 64-bit bitmap per 64 codepoints, indexed by `c >> 6`
+
+ // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
+ pub r2: [u8; 992], // first level: per 64-codepoint chunk, an index into `r3`
+ pub r3: &'static [u64], // leaves: shared 64-bit bitmaps selected through `r2`
+
+ // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
+ pub r4: [u8; 256], // first level: per 4096-codepoint block, selects a 64-entry slice of `r5`
+ pub r5: &'static [u8], // second level: per 64-codepoint chunk, an index into `r6`
+ pub r6: &'static [u64], // leaves: shared 64-bit bitmaps selected through `r5`
+}
+impl BoolTrie {
+ pub fn lookup(&self, c: char) -> bool { // true iff the bit for `c` is set in the trie
+ let c = c as usize; // scalar value, used below as a table index
+ if c < 0x800 { // direct case: `r1` holds the bitmaps themselves
+ trie_range_leaf(c, self.r1[c >> 6])
+ } else if c < 0x10000 { // one indirection: `r2` entry picks a bitmap in `r3`
+ let child = self.r2[(c >> 6) - 0x20]; // 0x20 == 0x800 >> 6, so index 0 is chunk 0x800
+ trie_range_leaf(c, self.r3[child as usize])
+ } else { // two indirections: `r4` -> 64-entry slice of `r5` -> bitmap in `r6`
+ let child = self.r4[(c >> 12) - 0x10]; // 0x10 == 0x10000 >> 12, so index 0 is block 0x10000
+ let leaf = self.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)]; // slice base + chunk within block
+ trie_range_leaf(c, self.r6[leaf as usize])
+ }
+ }
+}
+
+pub struct SmallBoolTrie {
+ pub(crate) r1: &'static [u8], // first level: per 64-codepoint chunk, an index into `r2`
+ pub(crate) r2: &'static [u64], // leaves: 64-bit bitmaps, one bit per codepoint in the chunk
+}
+
+impl SmallBoolTrie {
+ pub fn lookup(&self, c: char) -> bool { // true iff the bit for `c` is set in the trie
+ let c = c as usize; // scalar value, used below as a table index
+ match self.r1.get(c >> 6) { // checked access: `r1` may end before the chunk containing `c`
+ Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
+ None => false, // chunks past the end of `r1` are treated as having no members
+ }
+ }
+}
+
+fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool { // tests the bit of `bitmap_chunk` selected by `c`
+ ((bitmap_chunk >> (c & 63)) & 1) != 0 // `c & 63` keeps the low 6 bits: the position within the 64-bit chunk
+}
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
-/// Represents a Unicode Version.
-///
-/// See also: <http://www.unicode.org/versions/>
-#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
-pub struct UnicodeVersion {
- /// Major version.
- pub major: u32,
-
- /// Minor version.
- pub minor: u32,
-
- /// Micro (or Update) version.
- pub micro: u32,
-
- // Private field to keep struct expandable.
- _priv: (),
-}
+use version::UnicodeVersion;
+use bool_trie::{BoolTrie, SmallBoolTrie};
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
micro: 0,
_priv: (),
};
-
-
-// BoolTrie is a trie for representing a set of Unicode codepoints. It is
-// implemented with postfix compression (sharing of identical child nodes),
-// which gives both compact size and fast lookup.
-//
-// The space of Unicode codepoints is divided into 3 subareas, each
-// represented by a trie with different depth. In the first (0..0x800), there
-// is no trie structure at all; each u64 entry corresponds to a bitvector
-// effectively holding 64 bool values.
-//
-// In the second (0x800..0x10000), each child of the root node represents a
-// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
-// the trie stores an 8-bit index into a shared table of leaf values. This
-// exploits the fact that in reasonable sets, many such leaves can be shared.
-//
-// In the third (0x10000..0x110000), each child of the root node represents a
-// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
-// of a child tree. Each of these 64 bytes represents an index into the table
-// of shared 64-bit leaf values. This exploits the sparse structure in the
-// non-BMP range of most Unicode sets.
-pub struct BoolTrie {
- // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
- r1: [u64; 32], // leaves
-
- // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
- r2: [u8; 992], // first level
- r3: &'static [u64], // leaves
-
- // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
- r4: [u8; 256], // first level
- r5: &'static [u8], // second level
- r6: &'static [u64], // leaves
-}
-
-fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
- ((bitmap_chunk >> (c & 63)) & 1) != 0
-}
-
-fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
- let c = c as usize;
- if c < 0x800 {
- trie_range_leaf(c, r.r1[c >> 6])
- } else if c < 0x10000 {
- let child = r.r2[(c >> 6) - 0x20];
- trie_range_leaf(c, r.r3[child as usize])
- } else {
- let child = r.r4[(c >> 12) - 0x10];
- let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
- trie_range_leaf(c, r.r6[leaf as usize])
- }
-}
-
-pub struct SmallBoolTrie {
- r1: &'static [u8], // first level
- r2: &'static [u64], // leaves
-}
-
-impl SmallBoolTrie {
- fn lookup(&self, c: char) -> bool {
- let c = c as usize;
- match self.r1.get(c >> 6) {
- Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
- None => false,
- }
- }
-}
-
pub mod general_category {
- pub const Cc_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
+ pub const Cc_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 0
],
Cc_table.lookup(c)
}
- pub const N_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const N_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x03ff000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
};
pub fn N(c: char) -> bool {
- super::trie_lookup_range_table(c, N_table)
+ N_table.lookup(c)
}
}
pub mod derived_property {
- pub const Alphabetic_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const Alphabetic_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
};
pub fn Alphabetic(c: char) -> bool {
- super::trie_lookup_range_table(c, Alphabetic_table)
+ Alphabetic_table.lookup(c)
}
- pub const Case_Ignorable_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const Case_Ignorable_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0400408000000000, 0x0000000140000000, 0x0190a10000000000, 0x0000000000000000,
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
};
pub fn Case_Ignorable(c: char) -> bool {
- super::trie_lookup_range_table(c, Case_Ignorable_table)
+ Case_Ignorable_table.lookup(c)
}
- pub const Cased_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const Cased_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xf7ffffffffffffff, 0xfffffffffffffff0,
};
pub fn Cased(c: char) -> bool {
- super::trie_lookup_range_table(c, Cased_table)
+ Cased_table.lookup(c)
}
- pub const Lowercase_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const Lowercase_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe00000000, 0x0420040000000000, 0xff7fffff80000000,
0x55aaaaaaaaaaaaaa, 0xd4aaaaaaaaaaab55, 0xe6512d2a4e243129, 0xaa29aaaab5555240,
};
pub fn Lowercase(c: char) -> bool {
- super::trie_lookup_range_table(c, Lowercase_table)
+ Lowercase_table.lookup(c)
}
- pub const Uppercase_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const Uppercase_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x0000000007fffffe, 0x0000000000000000, 0x000000007f7fffff,
0xaa55555555555555, 0x2b555555555554aa, 0x11aed2d5b1dbced6, 0x55d255554aaaa490,
};
pub fn Uppercase(c: char) -> bool {
- super::trie_lookup_range_table(c, Uppercase_table)
+ Uppercase_table.lookup(c)
}
- pub const XID_Continue_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const XID_Continue_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x03ff000000000000, 0x07fffffe87fffffe, 0x04a0040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
};
pub fn XID_Continue(c: char) -> bool {
- super::trie_lookup_range_table(c, XID_Continue_table)
+ XID_Continue_table.lookup(c)
}
- pub const XID_Start_table: &'static super::BoolTrie = &super::BoolTrie {
+ pub const XID_Start_table: &super::BoolTrie = &super::BoolTrie {
r1: [
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
};
pub fn XID_Start(c: char) -> bool {
- super::trie_lookup_range_table(c, XID_Start_table)
+ XID_Start_table.lookup(c)
}
}
pub mod property {
- pub const Pattern_White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
+ pub const Pattern_White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Pattern_White_Space_table.lookup(c)
}
- pub const White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
+ pub const White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
r1: &[
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
}
}
- fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
+ fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
}
- const to_lowercase_table: &'static [(char, [char; 3])] = &[
+ const to_lowercase_table: &[(char, [char; 3])] = &[
('\u{41}', ['\u{61}', '\0', '\0']), ('\u{42}', ['\u{62}', '\0', '\0']), ('\u{43}',
['\u{63}', '\0', '\0']), ('\u{44}', ['\u{64}', '\0', '\0']), ('\u{45}', ['\u{65}', '\0',
'\0']), ('\u{46}', ['\u{66}', '\0', '\0']), ('\u{47}', ['\u{67}', '\0', '\0']), ('\u{48}',
('\u{1e920}', ['\u{1e942}', '\0', '\0']), ('\u{1e921}', ['\u{1e943}', '\0', '\0'])
];
- const to_uppercase_table: &'static [(char, [char; 3])] = &[
+ const to_uppercase_table: &[(char, [char; 3])] = &[
('\u{61}', ['\u{41}', '\0', '\0']), ('\u{62}', ['\u{42}', '\0', '\0']), ('\u{63}',
['\u{43}', '\0', '\0']), ('\u{64}', ['\u{44}', '\0', '\0']), ('\u{65}', ['\u{45}', '\0',
'\0']), ('\u{66}', ['\u{46}', '\0', '\0']), ('\u{67}', ['\u{47}', '\0', '\0']), ('\u{68}',
// NOTE: The following code was generated by "./unicode.py", do not edit directly
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
+
+use version::UnicodeVersion;
+use bool_trie::{BoolTrie, SmallBoolTrie};
'''
# Mapping taken from Table 12 from:
def escape_char(c):
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
-def emit_bsearch_range_table(f):
- f.write("""
-fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
- use core::cmp::Ordering::{Equal, Less, Greater};
- r.binary_search_by(|&(lo, hi)| {
- if c < lo {
- Greater
- } else if hi < c {
- Less
- } else {
- Equal
- }
- })
- .is_ok()
-}\n
-""")
-
-def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
+def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
pub_string = ""
if is_pub:
format_table_content(f, data, 8)
f.write("\n ];\n\n")
-def emit_trie_lookup_range_table(f):
- f.write("""
-
-// BoolTrie is a trie for representing a set of Unicode codepoints. It is
-// implemented with postfix compression (sharing of identical child nodes),
-// which gives both compact size and fast lookup.
-//
-// The space of Unicode codepoints is divided into 3 subareas, each
-// represented by a trie with different depth. In the first (0..0x800), there
-// is no trie structure at all; each u64 entry corresponds to a bitvector
-// effectively holding 64 bool values.
-//
-// In the second (0x800..0x10000), each child of the root node represents a
-// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
-// the trie stores an 8-bit index into a shared table of leaf values. This
-// exploits the fact that in reasonable sets, many such leaves can be shared.
-//
-// In the third (0x10000..0x110000), each child of the root node represents a
-// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
-// of a child tree. Each of these 64 bytes represents an index into the table
-// of shared 64-bit leaf values. This exploits the sparse structure in the
-// non-BMP range of most Unicode sets.
-pub struct BoolTrie {
- // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
- r1: [u64; 32], // leaves
-
- // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
- r2: [u8; 992], // first level
- r3: &'static [u64], // leaves
-
- // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
- r4: [u8; 256], // first level
- r5: &'static [u8], // second level
- r6: &'static [u64], // leaves
-}
-
-fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
- ((bitmap_chunk >> (c & 63)) & 1) != 0
-}
-
-fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
- let c = c as usize;
- if c < 0x800 {
- trie_range_leaf(c, r.r1[c >> 6])
- } else if c < 0x10000 {
- let child = r.r2[(c >> 6) - 0x20];
- trie_range_leaf(c, r.r3[child as usize])
- } else {
- let child = r.r4[(c >> 12) - 0x10];
- let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
- trie_range_leaf(c, r.r6[leaf as usize])
- }
-}
-
-pub struct SmallBoolTrie {
- r1: &'static [u8], // first level
- r2: &'static [u64], // leaves
-}
-
-impl SmallBoolTrie {
- fn lookup(&self, c: char) -> bool {
- let c = c as usize;
- match self.r1.get(c >> 6) {
- Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
- None => false,
- }
- }
-}
-
-""")
-
def compute_trie(rawdata, chunksize):
root = []
childmap = {}
pub_string = ""
if is_pub:
pub_string = "pub "
- f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
+ f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
f.write(" r1: [\n")
data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
format_table_content(f, data, 12)
pub_string = ""
if is_pub:
pub_string = "pub "
- f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
+ f.write(" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
% (pub_string, name))
(r1, r2) = compute_trie(chunks, 1)
else:
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
- f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
+ f.write(" %s_table.lookup(c)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
}
}
- fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
+ fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
}
""")
- t_type = "&'static [(char, [char; 3])]"
+ t_type = "&[(char, [char; 3])]"
pfun = lambda x: "(%s,[%s,%s,%s])" % (
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
emit_table(f, "to_lowercase_table",
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
-/// Represents a Unicode Version.
-///
-/// See also: <http://www.unicode.org/versions/>
-#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
-pub struct UnicodeVersion {
- /// Major version.
- pub major: u32,
-
- /// Minor version.
- pub minor: u32,
-
- /// Micro (or Update) version.
- pub micro: u32,
-
- // Private field to keep struct expandable.
- _priv: (),
-}
-
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
norm_props = load_properties("DerivedNormalizationProps.txt",
["Full_Composition_Exclusion"])
- # trie_lookup_table is used in all the property modules below
- emit_trie_lookup_range_table(rf)
- # emit_bsearch_range_table(rf)
-
# category tables
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
("derived_property", derived, want_derived), \