std: Refine and document HashMap's code

author Piotr Czarnecki <pioczarn@gmail.com>

Tue, 15 Jul 2014 20:58:35 +0000 (21:58 +0100)

committer Piotr Czarnecki <pioczarn@gmail.com>

Thu, 4 Sep 2014 22:22:32 +0000 (23:22 +0100)
author Piotr Czarnecki <pioczarn@gmail.com>
Tue, 15 Jul 2014 20:58:35 +0000 (21:58 +0100)
committer Piotr Czarnecki <pioczarn@gmail.com>
Thu, 4 Sep 2014 22:22:32 +0000 (23:22 +0100)
diff --git a/src/libstd/collections/hashmap/bench.rs b/src/libstd/collections/hashmap/bench.rs

index 66d97ba044847c20735d8f4e9dff128313b71f15..21bbb38f4893671686fff78f2230e2a42c09a6ba 100644 (file)
--- a/src/libstd/collections/hashmap/bench.rs
+++ b/src/libstd/collections/hashmap/bench.rs
@@ -38,7 +38,7 @@ fn new_insert_drop(b : &mut Bencher) {
  }
  
  #[bench]
-fn insert(b: &mut Bencher) {
+fn grow_by_insertion(b: &mut Bencher) {
      use super::HashMap;
  
      let mut m = HashMap::new();
diff --git a/src/libstd/collections/hashmap/map.rs b/src/libstd/collections/hashmap/map.rs

index 7a3779a91a080f42554c66d335e071b5ab423f65..a50c6a59f7e04b463411fcbb00f1a073c6b16e95 100644 (file)
--- a/src/libstd/collections/hashmap/map.rs
+++ b/src/libstd/collections/hashmap/map.rs
@@ -16,19 +16,27 @@
  use default::Default;
  use fmt::Show;
  use fmt;
-use RandomSipHasher;
-use hash::{Hash, Hasher};
-use iter::{Iterator, FromIterator, Extendable, range};
+use hash::{Hash, Hasher, RandomSipHasher};
+use iter::{Iterator, FromIterator, Extendable};
  use iter;
  use mem::replace;
  use num;
-use ops::Deref;
+use ops::{Deref, DerefMut};
  use option::{Some, None, Option};
  use result::{Ok, Err};
  use ops::Index;
  
-use super::table::{BucketWithTable, FullBucketImm, RawTable, FullBucket, FullBucketMut, Bucket};
  use super::table;
+use super::table::{
+    Bucket,
+    Empty,
+    Full,
+    FullBucket,
+    FullBucketImm,
+    FullBucketMut,
+    RawTable,
+    SafeHash
+};
  
  static INITIAL_LOG2_CAP: uint = 5;
  pub static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5
@@ -36,8 +44,9 @@
  /// The default behavior of HashMap implements a load factor of 90.9%.
  /// This behavior is characterized by the following conditions:
  ///
-/// - if `size * 1.1 < cap < size * 4` then shouldn't resize
-/// - if `cap < minimum_capacity * 2` then shouldn't shrink
+/// - if size > 0.909 * capacity: grow
+/// - if size < 0.25 * capacity: shrink (if this won't bring capacity lower
+///   than the minimum)
  #[deriving(Clone)]
  struct DefaultResizePolicy {
      /// Doubled minimal capacity. The capacity must never drop below
@@ -55,7 +64,12 @@ fn new(new_capacity: uint) -> DefaultResizePolicy {
  
      #[inline]
      fn capacity_range(&self, new_size: uint) -> (uint, uint) {
-        ((new_size * 11) / 10, max(new_size << 3, self.minimum_capacity2))
+        // Here, we are rephrasing the logic by specifying the ranges:
+        //
+        // - if `size * 1.1 < cap < size * 4`: don't resize
+        // - if `cap < minimum_capacity * 2`: don't shrink
+        // - otherwise, resize accordingly
+        ((new_size * 11) / 10, max(new_size << 2, self.minimum_capacity2))
      }
  
      #[inline]
@@ -65,9 +79,9 @@ fn reserve(&mut self, new_capacity: uint) {
  }
  
  // The main performance trick in this hashmap is called Robin Hood Hashing.
-// It gains its excellent performance from one crucial operation:
+// It gains its excellent performance from one essential operation:
  //
-//    If an insertion collides with an existing element, and that elements
+//    If an insertion collides with an existing element, and that element's
  //    "probe distance" (how far away the element is from its ideal location)
  //    is higher than how far we've already probed, swap the elements.
  //
@@ -94,6 +108,15 @@ fn reserve(&mut self, new_capacity: uint) {
  // α^3, etc. Therefore, the odds of colliding k times is α^k. The odds of NOT
  // colliding after k tries is 1-α^k.
  //
+// The paper from 1986 cited below mentions an implementation which keeps track
+// of the distance-to-initial-bucket histogram. This approach is not suitable
+// for modern architectures because it requires maintaining an internal data
+// structure. This allows very good first guesses, but we are most concerned
+// with guessing entire cache lines, not individual indexes. Furthermore, array
+// accesses are no longer linear and in one direction, as we have now. There
+// is also memory and cache pressure that this would entail that would be very
+// difficult to properly see in a microbenchmark.
+//
  // Future Improvements (FIXME!)
  // ============================
  //
@@ -106,15 +129,6 @@ fn reserve(&mut self, new_capacity: uint) {
  // Future Optimizations (FIXME!)
  // =============================
  //
-// The paper cited below mentions an implementation which keeps track of the
-// distance-to-initial-bucket histogram. I'm suspicious of this approach because
-// it requires maintaining an internal map. If this map were replaced with a
-// hashmap, it would be faster, but now our data structure is self-referential
-// and blows up. Also, this allows very good first guesses, but array accesses
-// are no longer linear and in one direction, as we have now. There is also
-// memory and cache pressure that this map would entail that would be very
-// difficult to properly see in a microbenchmark.
-//
  // Another possible design choice that I made without any real reason is
  // parameterizing the raw table over keys and values. Technically, all we need
  // is the size and alignment of keys and values, and the code should be just as
@@ -125,12 +139,56 @@ fn reserve(&mut self, new_capacity: uint) {
  // This would definitely be an avenue worth exploring if people start complaining
  // about the size of rust executables.
  //
-// There's also an "optimization" that has been omitted regarding how the
-// hashtable allocates. The vector type has set the expectation that a hashtable
-// which never has an element inserted should not allocate. I'm suspicious of
-// implementing this for hashtables, because supporting it has no performance
-// benefit over using an `Option<HashMap<K, V>>`, and is significantly more
-// complicated.
+// Annotate exceedingly likely branches in `table::make_hash`
+// and `search_hashed_generic` to reduce instruction cache pressure
+// and mispredictions once it becomes possible (blocked on issue #11092).
+//
+// Shrinking the table could simply reallocate in place after moving buckets
+// to the first half.
+//
+// The growth algorithm (fragment of the Proof of Correctness)
+// --------------------
+//
+// The growth algorithm is basically a fast path of the naive reinsertion-
+// during-resize algorithm. Other paths should never be taken.
+//
+// Consider growing a robin hood hashtable of capacity n. Normally, we do this
+// by allocating a new table of capacity `2n`, and then individually reinsert
+// each element in the old table into the new one. This guarantees that the
+// new table is a valid robin hood hashtable with all the desired statistical
+// properties. Remark that the order we reinsert the elements in should not
+// matter. For simplicity and efficiency, we will consider only linear
+// reinsertions, which consist of reinserting all elements in the old table
+// into the new one by increasing order of index. However we will not be
+// starting our reinsertions from index 0 in general. If we start from index
+// i, for the purpose of reinsertion we will consider all elements with real
+// index j < i to have virtual index n + j.
+//
+// Our hash generation scheme consists of generating a 64-bit hash and
+// truncating the most significant bits. When moving to the new table, we
+// simply introduce a new bit to the front of the hash. Therefore, if an
+// elements has ideal index i in the old table, it can have one of two ideal
+// locations in the new table. If the new bit is 0, then the new ideal index
+// is i. If the new bit is 1, then the new ideal index is n + i. Intutively,
+// we are producing two independent tables of size n, and for each element we
+// independently choose which table to insert it into with equal probability.
+// However the rather than wrapping around themselves on overflowing their
+// indexes, the first table overflows into the first, and the first into the
+// second. Visually, our new table will look something like:
+//
+// [yy_xxx_xxxx_xxx|xx_yyy_yyyy_yyy]
+//
+// Where x's are elements inserted into the first table, y's are elements
+// inserted into the second, and _'s are empty sections. We now define a few
+// key concepts that we will use later. Note that this is a very abstract
+// perspective of the table. A real resized table would be at least half
+// empty.
+//
+// Theorem: A linear robin hood reinsertion from the first ideal element
+// produces identical results to a linear naive reinsertion from the same
+// element.
+//
+// FIXME(Gankro, pczarn): review the proof and put it all in a separate doc.rs
  
  /// A hash map implementation which uses linear probing with Robin
  /// Hood bucket stealing.
@@ -219,27 +277,31 @@ pub struct HashMap<K, V, H = RandomSipHasher> {
      // All hashes are keyed on these values, to prevent hash collision attacks.
      hasher: H,
  
-    table: table::RawTable<K, V>,
+    table: RawTable<K, V>,
  
      // We keep this at the end since it might as well have tail padding.
      resize_policy: DefaultResizePolicy,
  }
  
  /// Search for a pre-hashed key.
-fn search_hashed_generic<K, V, M: Deref<RawTable<K, V>>>(table: M, hash: &table::SafeHash, is_match: |&K| -> bool)
-                        -> Option<FullBucket<K, V, M>> {
+fn search_hashed_generic<K, V, M: Deref<RawTable<K, V>>>(table: M,
+                                                         hash: &SafeHash,
+                                                         is_match: |&K| -> bool)
+                                                         -> SearchResult<K, V, M> {
      let size = table.size();
      let mut probe = Bucket::new(table, hash);
      let ib = probe.index();
  
      while probe.index() != ib + size {
          let full = match probe.peek() {
-            table::Empty(_) => return None, // hit an empty bucket
-            table::Full(b) => b
+            Empty(b) => return TableRef(b.into_table()), // hit an empty bucket
+            Full(b) => b
          };
  
          if full.distance() + ib < full.index() {
-            return None;
+            // We can finish the search early if we hit any bucket
+            // with a lower distance to initial bucket than we've probed.
+            return TableRef(full.into_table());
          }
  
          // If the hash doesn't match, it can't be this one..
@@ -251,65 +313,149 @@ fn search_hashed_generic<K, V, M: Deref<RawTable<K, V>>>(table: M, hash: &table:
  
              // If the key doesn't match, it can't be this one..
              if matched {
-                return Some(full);
+                return FoundExisting(full);
              }
          }
  
          probe = full.next();
      }
  
-    None
+    TableRef(probe.into_table())
  }
  
-fn search_hashed<K: Eq, V, M: Deref<RawTable<K, V>>>(table: M, hash: &table::SafeHash, k: &K)
-                -> Option<table::FullBucket<K, V, M>> {
+fn search_hashed<K: Eq, V, M: Deref<RawTable<K, V>>>(table: M, hash: &SafeHash, k: &K)
+                                                     -> SearchResult<K, V, M> {
      search_hashed_generic(table, hash, |k_| *k == *k_)
  }
  
  fn pop_internal<K, V>(starting_bucket: FullBucketMut<K, V>) -> V {
-    let size = {
-        let table = starting_bucket.table();
-        table.size()
-    };
      let (empty, _k, retval) = starting_bucket.take();
      let mut gap = match empty.gap_peek() {
          Some(b) => b,
          None => return retval
      };
-    // COMPILER error! wrong enum optimization. sets ptr to 0
  
-    for _ in range(0, size) {
-        if gap.full().distance() != 0 {
-            gap = match gap.shift() {
-                Some(b) => b,
-                None => return retval
+    while gap.full().distance() != 0 {
+        gap = match gap.shift() {
+            Some(b) => b,
+            None => break
+        };
+    }
+
+    // Now we've done all our shifting. Return the value we grabbed earlier.
+    return retval;
+}
+
+/// Perform robin hood bucket stealing at the given `bucket`. You must
+/// also pass the position of that bucket's initial bucket so we don't have
+/// to recalculate it.
+///
+/// `hash`, `k`, and `v` are the elements to "robin hood" into the hashtable.
+fn robin_hood<'a, K: 'a, V: 'a>(mut bucket: FullBucketMut<'a, K, V>,
+                        mut ib: uint,
+                        mut hash: SafeHash,
+                        mut k: K,
+                        mut v: V)
+                        -> &'a mut V {
+    let starting_index = bucket.index();
+    let size = {
+        let table = bucket.table(); // FIXME "lifetime too short".
+        table.size()
+    };
+    // There can be at most `size - dib` buckets to displace, because
+    // in the worst case, there are `size` elements and we already are
+    // `distance` buckets away from the initial one.
+    let idx_end = starting_index + size - bucket.distance();
+
+    loop {
+        let (old_hash, old_key, old_val) = bucket.replace(hash, k, v);
+        loop {
+            let probe = bucket.next();
+            assert!(probe.index() != idx_end);
+
+            let full_bucket = match probe.peek() {
+                table::Empty(bucket) => {
+                    // Found a hole!
+                    let b = bucket.put(old_hash, old_key, old_val);
+                    // Now that it's stolen, just read the value's pointer
+                    // right out of the table!
+                    let (_, v) = Bucket::at_index(b.into_table(), starting_index).peek()
+                                                                                 .expect_full()
+                                                                                 .into_mut_refs();
+                    return v;
+                },
+                table::Full(bucket) => bucket
              };
-            continue;
+
+            let probe_ib = full_bucket.index() - full_bucket.distance();
+
+            bucket = full_bucket;
+
+            // Robin hood! Steal the spot.
+            if ib < probe_ib {
+                ib = probe_ib;
+                hash = old_hash;
+                k = old_key;
+                v = old_val;
+                break;
+            }
+        }
+    }
+}
+
+/// A result that works like Option<FullBucket<..>> but preserves
+/// the reference that grants us access to the table in any case.
+enum SearchResult<K, V, M> {
+    // This is an entry that holds the given key:
+    FoundExisting(FullBucket<K, V, M>),
+
+    // There was no such entry. The reference is given back:
+    TableRef(M)
+}
+
+impl<K, V, M> SearchResult<K, V, M> {
+    fn into_option(self) -> Option<FullBucket<K, V, M>> {
+        match self {
+            FoundExisting(bucket) => Some(bucket),
+            TableRef(_) => None
          }
+    }
+}
  
-        break;
+/// A newtyped mutable reference to the hashmap that allows e.g. Deref to be
+/// implemented without making changes to the visible interface of HashMap.
+/// Used internally because it's accepted by the search functions above.
+struct MapMutRef<'a, K: 'a, V: 'a, H: 'a> {
+    map_ref: &'a mut HashMap<K, V, H>
+}
+
+impl<'a, K, V, H> Deref<RawTable<K, V>> for MapMutRef<'a, K, V, H> {
+    fn deref(&self) -> &RawTable<K, V> {
+        &self.map_ref.table
      }
+}
  
-    // Now we're done all our shifting. Return the value we grabbed
-    // earlier.
-    return retval;
+impl<'a, K, V, H> DerefMut<RawTable<K, V>> for MapMutRef<'a, K, V, H> {
+    fn deref_mut(&mut self) -> &mut RawTable<K, V> {
+        &mut self.map_ref.table
+    }
  }
  
  impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
-    fn make_hash<X: Hash<S>>(&self, x: &X) -> table::SafeHash {
+    fn make_hash<X: Hash<S>>(&self, x: &X) -> SafeHash {
          table::make_hash(&self.hasher, x)
      }
  
      fn search_equiv<'a, Q: Hash<S> + Equiv<K>>(&'a self, q: &Q)
                      -> Option<FullBucketImm<'a, K, V>> {
          let hash = self.make_hash(q);
-        search_hashed_generic(&self.table, &hash, |k| q.equiv(k))
+        search_hashed_generic(&self.table, &hash, |k| q.equiv(k)).into_option()
      }
  
      fn search_equiv_mut<'a, Q: Hash<S> + Equiv<K>>(&'a mut self, q: &Q)
                      -> Option<FullBucketMut<'a, K, V>> {
          let hash = self.make_hash(q);
-        search_hashed_generic(&mut self.table, &hash, |k| q.equiv(k))
+        search_hashed_generic(&mut self.table, &hash, |k| q.equiv(k)).into_option()
      }
  
      /// Search for a key, yielding the index if it's found in the hashtable.
@@ -317,25 +463,29 @@ fn search_equiv_mut<'a, Q: Hash<S> + Equiv<K>>(&'a mut self, q: &Q)
      /// search_hashed.
      fn search<'a>(&'a self, k: &K) -> Option<FullBucketImm<'a, K, V>> {
          let hash = self.make_hash(k);
-        search_hashed(&self.table, &hash, k)
+        search_hashed(&self.table, &hash, k).into_option()
      }
  
      fn search_mut<'a>(&'a mut self, k: &K) -> Option<FullBucketMut<'a, K, V>> {
          let hash = self.make_hash(k);
-        search_hashed(&mut self.table, &hash, k)
+        search_hashed(&mut self.table, &hash, k).into_option()
      }
  
-    fn insert_hashed_ordered(&mut self, hash: table::SafeHash, k: K, v: V) {
+    // The caller should ensure that invariants by Robin Hood Hashing hold.
+    fn insert_hashed_ordered(&mut self, hash: SafeHash, k: K, v: V) {
          let cap = self.table.capacity();
          let mut buckets = Bucket::new(&mut self.table, &hash);
          let ib = buckets.index();
+
          while buckets.index() != ib + cap {
+            // We don't need to compare hashes for value swap.
+            // Not even DIBs for Robin Hood.
              buckets = match buckets.peek() {
-                table::Empty(empty) => {
+                Empty(empty) => {
                      empty.put(hash, k, v);
                      return;
                  }
-                table::Full(b) => b.into_bucket()
+                Full(b) => b.into_bucket()
              };
              buckets.next();
          }
@@ -361,8 +511,8 @@ fn clear(&mut self) {
  
          while buckets.index() != cap {
              buckets = match buckets.peek() {
-                table::Empty(b)  => b.next(),
-                table::Full(full) => {
+                Empty(b)  => b.next(),
+                Full(full) => {
                      let (b, _, _) = full.take();
                      b.next()
                  }
@@ -401,7 +551,7 @@ fn swap(&mut self, k: K, v: V) -> Option<V> {
          self.make_some_room(potential_new_size);
  
          let mut retval = None;
-        self.insert_or_replace_with(hash, k, v, |val_ref, val| {
+        self.insert_or_replace_with(hash, k, v, |_, val_ref, val| {
              retval = Some(replace(val_ref, val));
          });
          retval
@@ -472,7 +622,7 @@ pub fn with_hasher(hasher: H) -> HashMap<K, V, H> {
          HashMap {
              hasher:        hasher,
              resize_policy: DefaultResizePolicy::new(INITIAL_CAPACITY),
-            table:         table::RawTable::new(0),
+            table:         RawTable::new(0),
          }
      }
  
@@ -500,7 +650,7 @@ pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashMap<K, V, H> {
          HashMap {
              hasher:        hasher,
              resize_policy: DefaultResizePolicy::new(cap),
-            table:         table::RawTable::new(cap),
+            table:         RawTable::new(cap),
          }
      }
  
@@ -537,49 +687,78 @@ fn resize(&mut self, new_capacity: uint) {
          assert!(self.table.size() <= new_capacity);
          assert!(num::is_power_of_two(new_capacity));
  
-        let mut old_table = replace(&mut self.table, table::RawTable::new(new_capacity));
+        let mut old_table = replace(&mut self.table, RawTable::new(new_capacity));
          let old_size = old_table.size();
  
-        if old_table.capacity() == 0 {
+        if old_table.capacity() == 0 || old_table.size() == 0 {
              return;
          }
  
          if new_capacity < old_table.capacity() {
+            // Shrink the table. Naive algorithm for resizing:
              for (h, k, v) in old_table.move_iter() {
                  self.insert_hashed_nocheck(h, k, v);
              }
          } else {
+            // Grow the table.
+            // Specialization of the other branch.
              let mut bucket = Bucket::first(&mut old_table);
  
+            // "So a few of the first shall be last: for many be called,
+            // but few chosen."
+            //
+            // We'll most likely encounter a few buckets at the beginning that
+            // have their initial buckets near the end of the table. They were
+            // placed at the beginning as the probe wrapped around the table
+            // during insertion. We must skip forward to a bucket that won't
+            // get reinserted too early and won't unfairly steal others spot.
+            // This eliminates the need for robin hood.
              loop {
-                match bucket.peek() {
-                    table::Full(full) => {
+                bucket = match bucket.peek() {
+                    Full(full) => {
                          if full.distance() == 0 {
+                            // This bucket occupies its ideal spot.
+                            // It indicates the start of another "cluster".
                              bucket = full.into_bucket();
                              break;
                          }
-                        bucket = full.next();
+                        // Leaving this bucket in the last cluster for later.
+                        full.into_bucket()
                      }
-                    table::Empty(b) => {
-                        bucket = b.next();
-                        break;
+                    Empty(b) => {
+                        // Encountered a hole between clusters.
+                        b.into_bucket()
                      }
                  };
+                bucket.next();
              }
  
+            // This is how the buckets might be laid out in memory:
+            // ($ marks an initialized bucket)
+            //  ________________
+            // |$$$_$$$$$$_$$$$$|
+            //
+            // But we've skipped the entire initial cluster of buckets
+            // and will continue iteration in this order:
+            //  ________________
+            //     |$$$$$$_$$$$$
+            //                  ^ wrap around once end is reached
+            //  ________________
+            //  $$$_____________|
+            //    ^ exit once table.size == 0
              loop {
                  bucket = match bucket.peek() {
-                    table::Full(bucket) => {
-                        {
-                            let t = bucket.table();
-                            if t.size() == 0 { break }
-                        }
+                    Full(bucket) => {
                          let h = bucket.hash();
                          let (b, k, v) = bucket.take();
                          self.insert_hashed_ordered(h, k, v);
+                        {
+                            let t = b.table(); // FIXME "lifetime too short".
+                            if t.size() == 0 { break }
+                        };
                          b.into_bucket()
                      }
-                    table::Empty(b) => b.into_bucket()
+                    Empty(b) => b.into_bucket()
                  };
                  bucket.next();
              }
@@ -612,41 +791,43 @@ fn make_some_room(&mut self, new_size: uint) {
      ///
      /// If the key already exists, the hashtable will be returned untouched
      /// and a reference to the existing element will be returned.
-    fn insert_hashed_nocheck<'a>(
-        &'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V {
-        self.insert_or_replace_with(hash, k, v, |_, _| ())
+    fn insert_hashed_nocheck(&mut self, hash: SafeHash, k: K, v: V) -> &mut V {
+        self.insert_or_replace_with(hash, k, v, |_, _, _| ())
      }
  
-    fn insert_or_replace_with<'a>(
-        &'a mut self, hash: table::SafeHash, k: K, v: V,
-        found_existing: |&mut V, V|
-    ) -> &'a mut V {
-
+    fn insert_or_replace_with<'a>(&'a mut self,
+                                  hash: SafeHash,
+                                  k: K,
+                                  v: V,
+                                  found_existing: |&mut K, &mut V, V|)
+                                  -> &'a mut V {
          // Worst case, we'll find one empty bucket among `size + 1` buckets.
          let size = self.table.size();
-        let mut rbucket = Bucket::new(&mut self.table, &hash);
-        let ib = rbucket.index();
+        let mut probe = Bucket::new(&mut self.table, &hash);
+        let ib = probe.index();
  
          loop {
-            let mut bucket = match rbucket.peek() {
-                table::Empty(bucket) => {
+            let mut bucket = match probe.peek() {
+                Empty(bucket) => {
                      // Found a hole!
                      let bucket = bucket.put(hash, k, v);
                      let (_, val) = bucket.into_mut_refs();
                      return val;
                  },
-                table::Full(bucket) => bucket
+                Full(bucket) => bucket
              };
  
              if bucket.hash() == hash {
-                let (bucket_k, bucket_v) = bucket.read_mut();
-                // FIXME #12147 the conditional return confuses
-                // borrowck if we return bucket_v directly
-                let bv: *mut V = bucket_v;
-                if k == *bucket_k {
+                let found_match = {
+                    let (bucket_k, _) = bucket.read_mut();
+                    k == *bucket_k
+                };
+                if found_match {
+                    let (bucket_k, bucket_v) = bucket.into_mut_refs();
+                    debug_assert!(k == *bucket_k);
                      // Key already exists. Get its reference.
-                    found_existing(bucket_v, v);
-                    return unsafe {&mut *bv};
+                    found_existing(bucket_k, bucket_v, v);
+                    return bucket_v;
                  }
              }
  
@@ -654,53 +835,18 @@ fn insert_or_replace_with<'a>(
  
              if (ib as int) < robin_ib {
                  // Found a luckier bucket than me. Better steal his spot.
-                let (mut hash, mut k, mut v) = bucket.replace(hash, k, v);
-                let robin_index = bucket.index();
-                let mut robin_ib = robin_ib as uint;
-                let mut rbucket = bucket.next();
-                loop {
-                    let mut bucket = match rbucket.peek() {
-                        table::Empty(bucket) => {
-                            // Found a hole!
-                            let b = bucket.put(hash, k, v);
-                            // Now that it's stolen, just read the value's pointer
-                            // right out of the table!
-                            let (_, v) = match Bucket::at_index(b.into_table(), robin_index).peek() {
-                                table::Full(b) => b.into_mut_refs(),
-                                _ => fail!()
-                            };
-                            return v;
-                        },
-                        table::Full(bucket) => bucket
-                    };
-
-                    let probe_ib = bucket.index() - bucket.distance();
-
-                    // Robin hood! Steal the spot.
-                    if robin_ib < probe_ib {
-                        robin_ib = probe_ib;
-                        let (old_hash, old_key, old_val) = bucket.replace(hash, k, v);
-                        hash = old_hash;
-                        k = old_key;
-                        v = old_val;
-                    }
-                    rbucket = bucket.next();
-                    if rbucket.index() == ib + size + 1 {
-                        fail!("HashMap fatal error: 100% load factor?")
-                    }
-                }
-            }
-            rbucket = bucket.next();
-            if rbucket.index() == ib + size + 1 {
-                fail!("Internal HashMap error: Out of space.")
+                return robin_hood(bucket, robin_ib as uint, hash, k, v);
              }
+
+            probe = bucket.next();
+            assert!(probe.index() != ib + size + 1);
          }
      }
  
      /// Inserts an element which has already been hashed, returning a reference
      /// to that element inside the hashtable. This is more efficient that using
      /// `insert`, since the key will not be rehashed.
-    fn insert_hashed<'a>(&'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V {
+    fn insert_hashed(&mut self, hash: SafeHash, k: K, v: V) -> &mut V {
          let potential_new_size = self.table.size() + 1;
          self.make_some_room(potential_new_size);
          self.insert_hashed_nocheck(hash, k, v)
@@ -721,7 +867,7 @@ fn insert_hashed<'a>(&'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut
      /// // Find the existing key
      /// assert_eq!(*map.find_or_insert("a", -2), 1);
      /// ```
-    pub fn find_or_insert<'a>(&'a mut self, k: K, v: V) -> &'a mut V {
+    pub fn find_or_insert(&mut self, k: K, v: V) -> &mut V {
          self.find_with_or_insert_with(k, v, |_k, _v, _a| (), |_k, a| a)
      }
  
@@ -768,7 +914,11 @@ pub fn insert_or_update_with<'a>(
                                   v: V,
                                   f: |&K, &mut V|)
                                   -> &'a mut V {
-        self.find_with_or_insert_with(k, v, |k, v, _a| f(k, v), |_k, a| a)
+        let potential_new_size = self.table.size() + 1;
+        self.make_some_room(potential_new_size);
+
+        let hash = self.make_hash(&k);
+        self.insert_or_replace_with(hash, k, v, |kref, vref, _v| f(kref, vref))
      }
  
      /// Modify and return the value corresponding to the key in the map, or
@@ -820,21 +970,22 @@ pub fn find_with_or_insert_with<'a, A>(&'a mut self,
                                             a: A,
                                             found: |&K, &mut V, A|,
                                             not_found: |&K, A| -> V)
-                                          -> &'a mut V {
+                                          -> &'a mut V
+    {
          let hash = self.make_hash(&k);
-        {
-            match search_hashed(&mut self.table, &hash, &k) {
-                Some(bucket) => {
-                    let (_, v_ref) = bucket.into_mut_refs();
-                    found(&k, v_ref, a);
-                    return v_ref;
-                }
-                _ => {
-                }
-            };
+        let this = MapMutRef { map_ref: self };
+
+        match search_hashed(this, &hash, &k) {
+            FoundExisting(bucket) => {
+                let (_, v_ref) = bucket.into_mut_refs();
+                found(&k, v_ref, a);
+                v_ref
+            }
+            TableRef(this) => {
+                let v = not_found(&k, a);
+                this.map_ref.insert_hashed(hash, k, v)
+            }
          }
-        let v = not_found(&k, a);
-        self.insert_hashed(hash, k, v)
      }
  
      /// Retrieves a value for the given key.
@@ -996,7 +1147,7 @@ pub fn pop_equiv<Q:Hash<S> + Equiv<K>>(&mut self, k: &Q) -> Option<V> {
      ///     println!("{}", key);
      /// }
      /// ```
-    pub fn keys<'a>(&'a self) -> Keys<'a, K, V> {
+    pub fn keys(&self) -> Keys<K, V> {
          self.iter().map(|(k, _v)| k)
      }
  
@@ -1017,7 +1168,7 @@ pub fn keys<'a>(&'a self) -> Keys<'a, K, V> {
      ///     println!("{}", key);
      /// }
      /// ```
-    pub fn values<'a>(&'a self) -> Values<'a, K, V> {
+    pub fn values(&self) -> Values<K, V> {
          self.iter().map(|(_k, v)| v)
      }
  
@@ -1038,8 +1189,8 @@ pub fn values<'a>(&'a self) -> Values<'a, K, V> {
      ///     println!("key: {} val: {}", key, val);
      /// }
      /// ```
-    pub fn iter<'a>(&'a self) -> Entries<'a, K, V> {
-        self.table.iter()
+    pub fn iter(&self) -> Entries<K, V> {
+        Entries { inner: self.table.iter() }
      }
  
      /// An iterator visiting all key-value pairs in arbitrary order,
@@ -1065,8 +1216,8 @@ pub fn iter<'a>(&'a self) -> Entries<'a, K, V> {
      ///     println!("key: {} val: {}", key, val);
      /// }
      /// ```
-    pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> {
-        self.table.mut_iter()
+    pub fn mut_iter(&mut self) -> MutEntries<K, V> {
+        MutEntries { inner: self.table.mut_iter() }
      }
  
      /// Creates a consuming iterator, that is, one that moves each key-value
@@ -1087,7 +1238,9 @@ pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> {
      /// let vec: Vec<(&str, int)> = map.move_iter().collect();
      /// ```
      pub fn move_iter(self) -> MoveEntries<K, V> {
-        self.table.move_iter().map(|(_, k, v)| (k, v))
+        MoveEntries {
+            inner: self.table.move_iter().map(|(_, k, v)| (k, v))
+        }
      }
  }
  
@@ -1131,13 +1284,9 @@ impl<K: Eq + Hash<S>, V: PartialEq, S, H: Hasher<S>> PartialEq for HashMap<K, V,
      fn eq(&self, other: &HashMap<K, V, H>) -> bool {
          if self.len() != other.len() { return false; }
  
-        self.iter()
-          .all(|(key, value)| {
-            match other.find(key) {
-                None    => false,
-                Some(v) => *value == *v
-            }
-        })
+        self.iter().all(|(key, value)|
+            other.find(key).map_or(false, |v| *value == *v)
+        )
      }
  }
  
@@ -1178,14 +1327,52 @@ fn index_mut<'a>(&'a mut self, index: &K) -> &'a mut V {
  }*/
  
  /// HashMap iterator
-pub type Entries<'a, K, V> = table::Entries<'a, K, V>;
+pub struct Entries<'a, K: 'a, V: 'a> {
+    inner: table::Entries<'a, K, V>
+}
  
  /// HashMap mutable values iterator
-pub type MutEntries<'a, K, V> = table::MutEntries<'a, K, V>;
+pub struct MutEntries<'a, K: 'a, V: 'a> {
+    inner: table::MutEntries<'a, K, V>
+}
  
  /// HashMap move iterator
-pub type MoveEntries<K, V> =
-    iter::Map<'static, (table::SafeHash, K, V), (K, V), table::MoveEntries<K, V>>;
+pub struct MoveEntries<K, V> {
+    inner: iter::Map<'static, (SafeHash, K, V), (K, V), table::MoveEntries<K, V>>
+}
+
+impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> {
+    #[inline]
+    fn next(&mut self) -> Option<(&'a K, &'a V)> {
+        self.inner.next()
+    }
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        self.inner.size_hint()
+    }
+}
+
+impl<'a, K, V> Iterator<(&'a K, &'a mut V)> for MutEntries<'a, K, V> {
+    #[inline]
+    fn next(&mut self) -> Option<(&'a K, &'a mut V)> {
+        self.inner.next()
+    }
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        self.inner.size_hint()
+    }
+}
+
+impl<K, V> Iterator<(K, V)> for MoveEntries<K, V> {
+    #[inline]
+    fn next(&mut self) -> Option<(K, V)> {
+        self.inner.next()
+    }
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        self.inner.size_hint()
+    }
+}
  
  /// HashMap keys iterator
  pub type Keys<'a, K, V> =
@@ -1266,7 +1453,6 @@ struct Dropable {
          k: uint
      }
  
-
      impl Dropable {
          fn new(k: uint) -> Dropable {
              let v = drop_vector.get().unwrap();
@@ -1371,6 +1557,7 @@ fn test_move_iter_drops() {
              hm
          };
  
+        // By the way, ensure that cloning doesn't screw up the dropping.
          drop(hm.clone());
  
          {
@@ -1505,6 +1692,28 @@ fn test_insert_conflicts() {
          assert_eq!(*m.find(&1).unwrap(), 2);
      }
  
+    #[test]
+    fn test_update_with() {
+        let mut m = HashMap::with_capacity(4);
+        assert!(m.insert(1i, 2i));
+
+        for i in range(1i, 1000) {
+            assert_eq!(
+                i + 2,
+                *m.insert_or_update_with(i + 1, i + 2, |_k, _v| {
+                    fail!("Key not yet present");
+                })
+            );
+            assert_eq!(
+                i + 1,
+                *m.insert_or_update_with(i, i + 3, |k, v| {
+                    assert_eq!(*k, i);
+                    assert_eq!(*v, i + 1);
+                })
+            );
+        }
+    }
+
      #[test]
      fn test_conflict_remove() {
          let mut m = HashMap::with_capacity(4);
@@ -1698,6 +1907,7 @@ fn test_resize_policy() {
              m.insert(i, i);
              i += 1;
          }
+        // three quarters full
  
          assert_eq!(m.len(), i);
          assert_eq!(m.table.capacity(), cap);
@@ -1706,16 +1916,18 @@ fn test_resize_policy() {
              m.insert(i, i);
              i += 1;
          }
+        // half full
  
          let new_cap = m.table.capacity();
          assert_eq!(new_cap, cap * 2);
  
-        for _ in range(0, cap / 2) {
+        for _ in range(0, cap / 2 - 1) {
              i -= 1;
              m.remove(&i);
              assert_eq!(m.table.capacity(), new_cap);
          }
-
+        // A little more than one quarter full.
+        // Shrinking starts as we remove more elements:
          for _ in range(0, cap / 2 - 1) {
              i -= 1;
              m.remove(&i);
diff --git a/src/libstd/collections/hashmap/mod.rs b/src/libstd/collections/hashmap/mod.rs

index f493e844526ee5c781e56030fc71aeb2d382771e..b5612ce0f077d05cf8f525c2983ec1c836085cb2 100644 (file)
--- a/src/libstd/collections/hashmap/mod.rs
+++ b/src/libstd/collections/hashmap/mod.rs
@@ -12,6 +12,7 @@
  
  pub use self::map::HashMap;
  pub use self::map::Entries;
+pub use self::map::MutEntries;
  pub use self::map::MoveEntries;
  pub use self::map::Keys;
  pub use self::map::Values;
diff --git a/src/libstd/collections/hashmap/set.rs b/src/libstd/collections/hashmap/set.rs

index a1f71e333033f3c0a1d1e211247d56309b5fee56..4a2a04cbc9f66b0ee452926fc5cda3361e39e37a 100644 (file)
--- a/src/libstd/collections/hashmap/set.rs
+++ b/src/libstd/collections/hashmap/set.rs
@@ -16,8 +16,7 @@
  use default::Default;
  use fmt::Show;
  use fmt;
-use RandomSipHasher;
-use hash::{Hash, Hasher};
+use hash::{Hash, Hasher, RandomSipHasher};
  use iter::{Iterator, FromIterator, FilterMap, Chain, Repeat, Zip, Extendable};
  use iter;
  use option::{Some, None};
@@ -25,13 +24,13 @@
  
  use super::{HashMap, Entries, MoveEntries, INITIAL_CAPACITY};
  
-/// HashSet iterator
-pub type SetItems<'a, K> =
-    iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>;
  
-/// HashSet move iterator
-pub type SetMoveItems<K> =
-    iter::Map<'static, (K, ()), K, MoveEntries<K, ()>>;
+// Future Optimization (FIXME!)
+// =============================
+//
+// Iteration over zero sized values is a noop. There is no need
+// for `bucket.val` in the case of HashSet. I suppose we would need HKT
+// to get rid of it properly.
  
  /// An implementation of a hash set using the underlying representation of a
  /// HashMap where the value is (). As with the `HashMap` type, a `HashSet`
@@ -444,6 +443,14 @@ fn default() -> HashSet<T, H> {
      }
  }
  
+/// HashSet iterator
+pub type SetItems<'a, K> =
+    iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>;
+
+/// HashSet move iterator
+pub type SetMoveItems<K> =
+    iter::Map<'static, (K, ()), K, MoveEntries<K, ()>>;
+
  // `Repeat` is used to feed the filter closure an explicit capture
  // of a reference to the other set
  /// Set operations iterator
diff --git a/src/libstd/collections/hashmap/table.rs b/src/libstd/collections/hashmap/table.rs

index 96d1a9ba2fbb534a441bb702392676fb768755ca..54469baaef5fa06d555bffaba1b1149adb66df43 100644 (file)
--- a/src/libstd/collections/hashmap/table.rs
+++ b/src/libstd/collections/hashmap/table.rs
@@ -14,14 +14,13 @@
  use cmp;
  use hash::{Hash, Hasher};
  use iter::{Iterator, count};
+use kinds::marker;
  use mem::{min_align_of, size_of};
  use mem;
-use num::{CheckedMul, is_power_of_two};
+use num::{CheckedAdd, CheckedMul, is_power_of_two};
  use ops::{Deref, DerefMut, Drop};
  use option::{Some, None, Option};
-use ptr::RawPtr;
-use ptr::set_memory;
-use ptr::write;
+use ptr::{RawPtr, copy_nonoverlapping_memory, zero_memory};
  use ptr;
  use rt::heap::{allocate, deallocate};
  
@@ -34,17 +33,17 @@
  /// `Vec<Option<u64, K, V>>`, because we don't pay for the overhead of an
  /// option on every element, and we get a generally more cache-aware design.
  ///
-/// Key invariants of this structure:
+/// Essential invariants of this structure:
  ///
-///   - if hashes[i] == EMPTY_BUCKET, then keys[i] and vals[i] have
-///     'undefined' contents. Don't read from them. This invariant is
-///     enforced outside this module with the `EmptyIndex`, `FullIndex`,
+///   - if t.hashes[i] == EMPTY_BUCKET, then `Bucket::at_index(&t, i).raw`
+///     points to 'undefined' contents. Don't read from it. This invariant is
+///     enforced outside this module with the `EmptyBucket`, `FullBucket`,
  ///     and `SafeHash` types.
  ///
-///   - An `EmptyIndex` is only constructed for a bucket at an index with
+///   - An `EmptyBucket` is only constructed at an index with
  ///     a hash of EMPTY_BUCKET.
  ///
-///   - A `FullIndex` is only constructed for a bucket at an index with a
+///   - A `FullBucket` is only constructed at an index with a
  ///     non-EMPTY_BUCKET hash.
  ///
  ///   - A `SafeHash` is only constructed for non-`EMPTY_BUCKET` hash. We get
@@ -56,48 +55,21 @@
  ///     `capacity`. This is set at creation and never changes. The arrays
  ///     are unzipped to save space (we don't have to pay for the padding
  ///     between odd sized elements, such as in a map from u64 to u8), and
-///     be more cache aware (scanning through 8 hashes brings in 2 cache
-///     lines, since they're all right beside each other).
+///     be more cache aware (scanning through 8 hashes brings in at most
+///     2 cache lines, since they're all right beside each other).
  ///
  /// You can kind of think of this module/data structure as a safe wrapper
  /// around just the "table" part of the hashtable. It enforces some
  /// invariants at the type level and employs some performance trickery,
  /// but in general is just a tricked out `Vec<Option<u64, K, V>>`.
-///
-/// FIXME(cgaebel):
-///
-/// Feb 11, 2014: This hashtable was just implemented, and, hard as I tried,
-/// isn't yet totally safe. There's a "known exploit" that you can create
-/// multiple FullIndexes for a bucket, `take` one, and then still `take`
-/// the other causing undefined behavior. Currently, there's no story
-/// for how to protect against this statically. Therefore, there are asserts
-/// on `take`, `get`, `get_mut`, and `put` which check the bucket state.
-/// With time, and when we're confident this works correctly, they should
-/// be removed. Also, the bounds check in `peek` is especially painful,
-/// as that's called in the innermost loops of the hashtable and has the
-/// potential to be a major performance drain. Remove this too.
-///
-/// Or, better than remove, only enable these checks for debug builds.
-/// There's currently no "debug-only" asserts in rust, so if you're reading
-/// this and going "what? of course there are debug-only asserts!", then
-/// please make this use them!
  #[unsafe_no_drop_flag]
  pub struct RawTable<K, V> {
      capacity: uint,
      size:     uint,
-    hashes:   *mut u64
-}
-
-/// A bucket that holds a reference to the table
-pub trait BucketWithTable<M> {
-    /// A bucket that holds a reference to the table
-    fn table<'a>(&'a self) -> &'a M;
-
-    /// Move out the reference to the table.
-    fn into_table(self) -> M;
-
-    /// Get the raw index.
-    fn index(&self) -> uint;
+    hashes:   *mut u64,
+    // Because K/V do not appear directly in any of the types in the struct,
+    // inform rustc that in fact instances of K and V are reachable from here.
+    marker:   marker::CovariantType<(K,V)>,
  }
  
  struct RawBucket<K, V> {
@@ -124,47 +96,66 @@ pub struct FullBucket<K, V, M> {
      table: M
  }
  
-pub type EmptyBucketImm<'table,K,V> = EmptyBucket<K, V, &'table RawTable<K,V>>;
-pub type  FullBucketImm<'table,K,V> =  FullBucket<K, V, &'table RawTable<K,V>>;
+pub type EmptyBucketImm<'table, K, V> = EmptyBucket<K, V, &'table RawTable<K, V>>;
+pub type  FullBucketImm<'table, K, V> =  FullBucket<K, V, &'table RawTable<K, V>>;
  
-pub type EmptyBucketMut<'table,K,V> = EmptyBucket<K, V, &'table mut RawTable<K,V>>;
-pub type  FullBucketMut<'table,K,V> =  FullBucket<K, V, &'table mut RawTable<K,V>>;
+pub type EmptyBucketMut<'table, K, V> = EmptyBucket<K, V, &'table mut RawTable<K, V>>;
+pub type  FullBucketMut<'table, K, V> =  FullBucket<K, V, &'table mut RawTable<K, V>>;
  
+pub enum BucketState<K, V, M> {
+    Empty(EmptyBucket<K, V, M>),
+    Full(FullBucket<K, V, M>),
+}
+
+// A GapThenFull encapsulates the state of two consecutive buckets at once.
+// The first bucket, called the gap, is known to be empty.
+// The second bucket is full.
  struct GapThenFull<K, V, M> {
      gap: EmptyBucket<K, V, ()>,
-    full: FullBucket<K, V, M>
+    full: FullBucket<K, V, M>,
  }
  
-impl<K, V, M: Deref<RawTable<K,V>>> GapThenFull<K, V, M> {
-    pub fn full<'a>(&'a self) -> &'a FullBucket<K, V, M> {
-        &self.full
-    }
-
-    pub fn shift(mut self) -> Option<GapThenFull<K, V, M>> {
-        unsafe {
-            *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET);
-            mem::overwrite(self.gap.raw.key, ptr::read(self.full.raw.key as *const K));
-            mem::overwrite(self.gap.raw.val, ptr::read(self.full.raw.val as *const V));
-        }
-
-        let FullBucket { raw, idx, .. } = self.full;
-
-        match self.full.next().peek() {
-            Empty(_) => None,
-            Full(bucket) => {
-                self.gap.raw = raw;
-                self.gap.idx = idx;
+/// A hash that is not zero, since we use a hash of zero to represent empty
+/// buckets.
+#[deriving(PartialEq)]
+pub struct SafeHash {
+    hash: u64,
+}
  
-                self.full = bucket;
-                self.full.idx &= self.full.table.capacity - 1;
+impl SafeHash {
+    /// Peek at the hash value, which is guaranteed to be non-zero.
+    #[inline(always)]
+    pub fn inspect(&self) -> u64 { self.hash }
+}
  
-                Some(self)
-            }
-        }
+/// We need to remove hashes of 0. That's reserved for empty buckets.
+/// This function wraps up `hash_keyed` to be the only way outside this
+/// module to generate a SafeHash.
+pub fn make_hash<T: Hash<S>, S, H: Hasher<S>>(hasher: &H, t: &T) -> SafeHash {
+    match hasher.hash(t) {
+        // This constant is exceedingly likely to hash to the same
+        // bucket, but it won't be counted as empty! Just so we can maintain
+        // our precious uniform distribution of initial indexes.
+        EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 },
+        h            => SafeHash { hash: h },
      }
  }
  
-impl<K, V> RawPtr<u64> for RawBucket<K, V> {
+// `replace` casts a `*u64` to a `*SafeHash`. Since we statically
+// ensure that a `FullBucket` points to an index with a non-zero hash,
+// and a `SafeHash` is just a `u64` with a different name, this is
+// safe.
+//
+// This test ensures that a `SafeHash` really IS the same size as a
+// `u64`. If you need to change the size of `SafeHash` (and
+// consequently made this test fail), `replace` needs to be
+// modified to no longer assume this.
+#[test]
+fn can_alias_safehash_as_u64() {
+    assert_eq!(size_of::<SafeHash>(), size_of::<u64>())
+}
+
+impl<K, V> RawBucket<K, V> {
      unsafe fn offset(self, count: int) -> RawBucket<K, V> {
          RawBucket {
              hash: self.hash.offset(count),
@@ -172,35 +163,143 @@ unsafe fn offset(self, count: int) -> RawBucket<K, V> {
              val:  self.val.offset(count),
          }
      }
+}
  
-    fn null() -> RawBucket<K, V> {
-        RawBucket {
-            hash: RawPtr::null(),
-            key:  RawPtr::null(),
-            val:  RawPtr::null()
+// For parameterizing over mutability.
+impl<'t, K, V> Deref<RawTable<K, V>> for &'t RawTable<K, V> {
+    fn deref(&self) -> &RawTable<K, V> {
+        &**self
+    }
+}
+
+impl<'t, K, V> Deref<RawTable<K, V>> for &'t mut RawTable<K, V> {
+    fn deref(&self) -> &RawTable<K,V> {
+        &**self
+    }
+}
+
+impl<'t, K, V> DerefMut<RawTable<K, V>> for &'t mut RawTable<K, V> {
+    fn deref_mut(&mut self) -> &mut RawTable<K,V> {
+        &mut **self
+    }
+}
+
+// Buckets hold references to the table.
+impl<K, V, M> FullBucket<K, V, M> {
+    /// Borrow a reference to the table.
+    pub fn table(&self) -> &M {
+        &self.table
+    }
+    /// Move out the reference to the table.
+    pub fn into_table(self) -> M {
+        self.table
+    }
+    /// Get the raw index.
+    pub fn index(&self) -> uint {
+        self.idx
+    }
+}
+
+impl<K, V, M> EmptyBucket<K, V, M> {
+    /// Borrow a reference to the table.
+    pub fn table(&self) -> &M {
+        &self.table
+    }
+    /// Move out the reference to the table.
+    pub fn into_table(self) -> M {
+        self.table
+    }
+}
+
+impl<K, V, M> Bucket<K, V, M> {
+    /// Move out the reference to the table.
+    pub fn into_table(self) -> M {
+        self.table
+    }
+    /// Get the raw index.
+    pub fn index(&self) -> uint {
+        self.idx
+    }
+}
+
+impl<K, V, M: Deref<RawTable<K, V>>> Bucket<K, V, M> {
+    pub fn new(table: M, hash: &SafeHash) -> Bucket<K, V, M> {
+        Bucket::at_index(table, hash.inspect() as uint)
+    }
+
+    pub fn at_index(table: M, ib_index: uint) -> Bucket<K, V, M> {
+        let ib_index = ib_index & (table.capacity() - 1);
+        Bucket {
+            raw: unsafe {
+               table.first_bucket_raw().offset(ib_index as int)
+            },
+            idx: ib_index,
+            table: table
          }
      }
  
-    fn is_null(&self) -> bool {
-        self.hash.is_null()
+    pub fn first(table: M) -> Bucket<K, V, M> {
+        Bucket {
+            raw: table.first_bucket_raw(),
+            idx: 0,
+            table: table
+        }
      }
  
-    fn to_uint(&self) -> uint {
-        self.hash.to_uint()
+    /// Reads a bucket at a given index, returning an enum indicating whether
+    /// it's initialized or not. You need to match on this enum to get
+    /// the appropriate types to call most of the other functions in
+    /// this module.
+    pub fn peek(self) -> BucketState<K, V, M> {
+        match unsafe { *self.raw.hash } {
+            EMPTY_BUCKET =>
+                Empty(EmptyBucket {
+                    raw: self.raw,
+                    idx: self.idx,
+                    table: self.table
+                }),
+            _ =>
+                Full(FullBucket {
+                    raw: self.raw,
+                    idx: self.idx,
+                    table: self.table
+                })
+        }
      }
  
-    unsafe fn to_option(&self) -> Option<&u64> {
-        self.hash.to_option()
+    /// Modifies the bucket pointer in place to make it point to the next slot.
+    pub fn next(&mut self) {
+        // Branchless bucket iteration step.
+        // As we reach the end of the table...
+        // We take the current idx:          0111111b
+        // Xor it by its increment:        ^ 1000000b
+        //                               ------------
+        //                                   1111111b
+        // Then AND with the capacity:     & 1000000b
+        //                               ------------
+        // to get the backwards offset:      1000000b
+        // ... and it's zero at all other times.
+        let maybe_wraparound_dist = (self.idx ^ (self.idx + 1)) & self.table.capacity();
+        // Finally, we obtain the offset 1 or the offset -cap + 1.
+        let dist = 1i - (maybe_wraparound_dist as int);
+
+        self.idx += 1;
+
+        unsafe {
+            self.raw = self.raw.offset(dist);
+        }
      }
  }
  
-impl<K, V, M: Deref<RawTable<K,V>>> EmptyBucket<K, V, M> {
+impl<K, V, M: Deref<RawTable<K, V>>> EmptyBucket<K, V, M> {
+    #[inline]
      pub fn next(self) -> Bucket<K, V, M> {
          let mut bucket = self.into_bucket();
          bucket.next();
          bucket
      }
  
+    #[inline]
      pub fn into_bucket(self) -> Bucket<K, V, M> {
          Bucket {
              raw: self.raw,
@@ -217,24 +316,31 @@ pub fn gap_peek(self) -> Option<GapThenFull<K, V, M>> {
          };
  
          match self.next().peek() {
-            Empty(_) => None,
              Full(bucket) => {
                  Some(GapThenFull {
                      gap: gap,
                      full: bucket
                  })
              }
+            Empty(..) => None
          }
      }
  }
  
-impl<K, V, M: DerefMut<RawTable<K,V>>> EmptyBucket<K, V, M> {
+impl<K, V, M: DerefMut<RawTable<K, V>>> EmptyBucket<K, V, M> {
+    /// Puts given key and value pair, along with the key's hash,
+    /// into this bucket in the hashtable. Note how `self` is 'moved' into
+    /// this function, because this slot will no longer be empty when
+    /// we return! A `FullBucket` is returned for later use, pointing to
+    /// the newly-filled slot in the hashtable.
+    ///
+    /// Use `make_hash` to construct a `SafeHash` to pass to this function.
      pub fn put(mut self, hash: SafeHash, key: K, value: V)
                 -> FullBucket<K, V, M> {
          unsafe {
              *self.raw.hash = hash.inspect();
-            write(self.raw.key, key);
-            write(self.raw.val, value);
+            ptr::write(self.raw.key, key);
+            ptr::write(self.raw.val, value);
          }
  
          self.table.size += 1;
@@ -243,13 +349,15 @@ pub fn put(mut self, hash: SafeHash, key: K, value: V)
      }
  }
  
-impl<K, V, M: Deref<RawTable<K,V>>> FullBucket<K, V, M> {
+impl<K, V, M: Deref<RawTable<K, V>>> FullBucket<K, V, M> {
+    #[inline]
      pub fn next(self) -> Bucket<K, V, M> {
          let mut bucket = self.into_bucket();
          bucket.next();
          bucket
      }
  
+    #[inline]
      pub fn into_bucket(self) -> Bucket<K, V, M> {
          Bucket {
              raw: self.raw,
@@ -258,10 +366,19 @@ pub fn into_bucket(self) -> Bucket<K, V, M> {
          }
      }
  
+    /// Get the distance between this bucket and the 'ideal' location
+    /// as determined by the key's hash stored in it.
+    ///
+    /// In the cited blog posts above, this is called the "distance to
+    /// initial bucket", or DIB. Also known as "probe count".
      pub fn distance(&self) -> uint {
+        // Calculates the distance one has to travel when going from
+        // `hash mod capacity` onwards to `idx mod capacity`, wrapping around
+        // if the destination is not reached before the end of the table.
          (self.idx - self.hash().inspect() as uint) & (self.table.capacity() - 1)
      }
  
+    #[inline]
      pub fn hash(&self) -> SafeHash {
          unsafe {
              SafeHash {
@@ -270,23 +387,20 @@ pub fn hash(&self) -> SafeHash {
          }
      }
  
-    pub fn read<'a>(&'a self) -> (&'a K, &'a V) {
-        unsafe {
-            (&*self.raw.key,
-             &*self.raw.val)
-        }
-    }
-
-    pub fn into_refs(self) -> (&K, &V) {
+    /// Gets references to the key and value at a given index.
+    pub fn read(&self) -> (&K, &V) {
          unsafe {
-            // debug_assert!(*self.raw.hash != EMPTY_BUCKET);
              (&*self.raw.key,
               &*self.raw.val)
          }
      }
  }
  
-impl<K, V, M: DerefMut<RawTable<K,V>>> FullBucket<K, V, M> {
+impl<K, V, M: DerefMut<RawTable<K, V>>> FullBucket<K, V, M> {
+    /// Removes this bucket's key and value from the hashtable.
+    ///
+    /// This works similarly to `put`, building an `EmptyBucket` out of the
+    /// taken bucket.
      pub fn take(mut self) -> (EmptyBucket<K, V, M>, K, V) {
          let key = self.raw.key as *const K;
          let val = self.raw.val as *const V;
@@ -317,176 +431,86 @@ pub fn replace(&mut self, h: SafeHash, k: K, v: V) -> (SafeHash, K, V) {
          }
      }
  
-    pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) {
+    /// Gets mutable references to the key and value at a given index.
+    pub fn read_mut(&mut self) -> (&mut K, &mut V) {
          unsafe {
-            // debug_assert!(*self.raw.hash != EMPTY_BUCKET);
              (&mut *self.raw.key,
               &mut *self.raw.val)
          }
      }
+}
  
-    pub fn into_mut_refs(self) -> (&mut K, &mut V) {
+impl<'t, K, V, M: Deref<RawTable<K, V>> + 't> FullBucket<K, V, M> {
+    /// Exchange a bucket state for immutable references into the table.
+    /// Because the underlying reference to the table is also consumed,
+    /// no further changes to the structure of the table are possible;
+    /// in exchange for this, the returned references have a longer lifetime
+    /// than the references returned by `read()`.
+    pub fn into_refs(self) -> (&'t K, &'t V) {
          unsafe {
-            // debug_assert!(*self.raw.hash != EMPTY_BUCKET);
-            (&mut *self.raw.key,
-             &mut *self.raw.val)
+            (&*self.raw.key,
+             &*self.raw.val)
          }
      }
  }
  
-impl<K, V, M: Deref<RawTable<K,V>>> Bucket<K, V, M> {
-    pub fn new(table: M, hash: &SafeHash) -> Bucket<K, V, M> {
-        let ib_index = (hash.inspect() as uint) & (table.capacity() - 1);
-        Bucket {
-            raw: unsafe {
-               table.as_mut_ptrs().offset(ib_index as int)
-            },
-            idx: ib_index,
-            table: table
-        }
-    }
-
-    pub fn at_index(table: M, ib_index: uint) -> Bucket<K, V, M> {
-        let ib_index = ib_index & (table.capacity() - 1);
-        Bucket {
-            raw: unsafe {
-               table.as_mut_ptrs().offset(ib_index as int)
-            },
-            idx: ib_index,
-            table: table
-        }
-    }
-
-    pub fn first(table: M) -> Bucket<K, V, M> {
-        Bucket {
-            raw: table.as_mut_ptrs(),
-            idx: 0,
-            table: table
-        }
-    }
-
-    pub fn peek(self) -> BucketState<K, V, M> {
-        match unsafe { *self.raw.hash } {
-            EMPTY_BUCKET =>
-                Empty(EmptyBucket {
-                    raw: self.raw,
-                    idx: self.idx,
-                    table: self.table
-                }),
-            _ =>
-                Full(FullBucket {
-                    raw: self.raw,
-                    idx: self.idx,
-                    table: self.table
-                })
-        }
-    }
-
-    pub fn next(&mut self) {
-        self.idx += 1;
-
-        let dist = if self.idx == self.table.capacity() {
-            -(self.table.capacity() as int - 1)
-        } else {
-            1i
-        };
-
+impl<'t, K, V, M: DerefMut<RawTable<K, V>> + 't> FullBucket<K, V, M> {
+    /// This works similarly to `into_refs`, exchanging a bucket state
+    /// for mutable references into the table.
+    pub fn into_mut_refs(self) -> (&'t mut K, &'t mut V) {
          unsafe {
-            self.raw = self.raw.offset(dist);
+            (&mut *self.raw.key,
+             &mut *self.raw.val)
          }
      }
  }
  
-impl<K, V, M> BucketWithTable<M> for FullBucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
-    }
-
-    fn into_table(self) -> M {
-        self.table
-    }
-
-    fn index(&self) -> uint {
-        self.idx
-    }
-}
-
-impl<K, V, M> BucketWithTable<M> for EmptyBucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
-    }
-
-    fn into_table(self) -> M {
-        self.table
-    }
-
-    fn index(&self) -> uint {
-        self.idx
+impl<K, V, M> BucketState<K, V, M> {
+    // For convenience.
+    pub fn expect_full(self) -> FullBucket<K, V, M> {
+        match self {
+            Full(full) => full,
+            Empty(..) => fail!("Expected full bucket")
+        }
      }
  }
  
-impl<K, V, M> BucketWithTable<M> for Bucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
+impl<K, V, M: Deref<RawTable<K, V>>> GapThenFull<K, V, M> {
+    #[inline]
+    pub fn full(&self) -> &FullBucket<K, V, M> {
+        &self.full
      }
  
-    fn into_table(self) -> M {
-        self.table
-    }
+    pub fn shift(mut self) -> Option<GapThenFull<K, V, M>> {
+        unsafe {
+            *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET);
+            copy_nonoverlapping_memory(self.gap.raw.key, self.full.raw.key as *const K, 1);
+            copy_nonoverlapping_memory(self.gap.raw.val, self.full.raw.val as *const V, 1);
+        }
  
-    fn index(&self) -> uint {
-        self.idx
-    }
-}
+        let FullBucket { raw: prev_raw, idx: prev_idx, .. } = self.full;
  
-impl<'table,K,V> Deref<RawTable<K,V>> for &'table RawTable<K,V> {
-    fn deref<'a>(&'a self) -> &'a RawTable<K,V> {
-        &**self
-    }
-}
+        match self.full.next().peek() {
+            Full(bucket) => {
+                self.gap.raw = prev_raw;
+                self.gap.idx = prev_idx;
  
-impl<'table,K,V> Deref<RawTable<K,V>> for &'table mut RawTable<K,V> {
-    fn deref<'a>(&'a self) -> &'a RawTable<K,V> {
-        &**self
-    }
-}
+                self.full = bucket;
  
-impl<'table,K,V> DerefMut<RawTable<K,V>> for &'table mut RawTable<K,V> {
-    fn deref_mut<'a>(&'a mut self) -> &'a mut RawTable<K,V> {
-        &mut **self
+                Some(self)
+            }
+            Empty(..) => None
+        }
      }
  }
  
-pub enum BucketState<K, V, M> {
-    Empty(EmptyBucket<K, V, M>),
-    Full(FullBucket<K, V, M>),
-}
-
-/// A hash that is not zero, since we use a hash of zero to represent empty
-/// buckets.
-#[deriving(PartialEq)]
-pub struct SafeHash {
-    hash: u64,
-}
-
-impl SafeHash {
-    /// Peek at the hash value, which is guaranteed to be non-zero.
-    #[inline(always)]
-    pub fn inspect(&self) -> u64 { self.hash }
-}
-
-/// We need to remove hashes of 0. That's reserved for empty buckets.
-/// This function wraps up `hash_keyed` to be the only way outside this
-/// module to generate a SafeHash.
-pub fn make_hash<T: Hash<S>, S, H: Hasher<S>>(hasher: &H, t: &T) -> SafeHash {
-    match hasher.hash(t) {
-        // This constant is exceedingly likely to hash to the same
-        // bucket, but it won't be counted as empty!
-        EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 },
-        h            => SafeHash { hash: h },
-    }
-}
  
+/// Rounds up to a multiple of a power of two. Returns the closest multiple
+/// of `target_alignment` that is higher or equal to `unrounded`.
+///
+/// # Failure
+///
+/// Fails if `target_alignment` is not a power of two.
  fn round_up_to_next(unrounded: uint, target_alignment: uint) -> uint {
      assert!(is_power_of_two(target_alignment));
      (unrounded + target_alignment - 1) & !(target_alignment - 1)
@@ -531,7 +555,6 @@ fn test_offset_calculation() {
  }
  
  impl<K, V> RawTable<K, V> {
-
      /// Does not initialize the buckets. The caller should ensure they,
      /// at the very least, set every hash to EMPTY_BUCKET.
      unsafe fn new_uninitialized(capacity: uint) -> RawTable<K, V> {
@@ -540,6 +563,7 @@ unsafe fn new_uninitialized(capacity: uint) -> RawTable<K, V> {
                  size: 0,
                  capacity: 0,
                  hashes: 0 as *mut u64,
+                marker: marker::CovariantType,
              };
          }
          let hashes_size = capacity.checked_mul(&size_of::<u64>())
@@ -571,17 +595,18 @@ unsafe fn new_uninitialized(capacity: uint) -> RawTable<K, V> {
              capacity: capacity,
              size:     0,
              hashes:   hashes,
+            marker:   marker::CovariantType,
          }
      }
  
-    fn as_mut_ptrs(&self) -> RawBucket<K, V> {
+    fn first_bucket_raw(&self) -> RawBucket<K, V> {
          let hashes_size = self.capacity * size_of::<u64>();
          let keys_size = self.capacity * size_of::<K>();
  
-        let keys_offset = (hashes_size + min_align_of::< K >() - 1) & !(min_align_of::< K >() - 1);
+        let keys_offset = (hashes_size + min_align_of::<K>() - 1) & !(min_align_of::<K>() - 1);
          let end_of_keys = keys_offset + keys_size;
  
-        let vals_offset = (end_of_keys + min_align_of::< V >() - 1) & !(min_align_of::< V >() - 1);
+        let vals_offset = (end_of_keys + min_align_of::<V>() - 1) & !(min_align_of::<V>() - 1);
  
          let buffer = self.hashes as *mut u8;
  
@@ -600,7 +625,7 @@ fn as_mut_ptrs(&self) -> RawBucket<K, V> {
      pub fn new(capacity: uint) -> RawTable<K, V> {
          unsafe {
              let ret = RawTable::new_uninitialized(capacity);
-            set_memory(ret.hashes, 0u8, capacity);
+            zero_memory(ret.hashes, capacity);
              ret
          }
      }
@@ -616,49 +641,51 @@ pub fn size(&self) -> uint {
          self.size
      }
  
-    fn ptrs<'a>(&'a self) -> RawBuckets<'a, K, V> {
+    fn raw_buckets(&self) -> RawBuckets<K, V> {
          RawBuckets {
-            raw: self.as_mut_ptrs(),
+            raw: self.first_bucket_raw(),
              hashes_end: unsafe {
                  self.hashes.offset(self.capacity as int)
              }
          }
      }
  
-    pub fn iter<'a>(&'a self) -> Entries<'a, K, V> {
+    pub fn iter(&self) -> Entries<K, V> {
          Entries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
              elems_left: self.size(),
          }
      }
  
-    pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> {
+    pub fn mut_iter(&mut self) -> MutEntries<K, V> {
          MutEntries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
              elems_left: self.size(),
          }
      }
  
      pub fn move_iter(self) -> MoveEntries<K, V> {
          MoveEntries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
              table: self,
          }
      }
  
-    pub fn rev_move_buckets<'a>(&'a mut self) -> RevMoveBuckets<'a, K, V> {
-        let raw_bucket = self.as_mut_ptrs();
-        unsafe {
-            RevMoveBuckets {
-                raw: raw_bucket.offset(self.capacity as int),
-                hashes_end: raw_bucket.hash,
-                elems_left: self.size
-            }
+    /// Returns an iterator that copies out each entry. Used while the table
+    /// is being dropped.
+    unsafe fn rev_move_buckets(&mut self) -> RevMoveBuckets<K, V> {
+        let raw_bucket = self.first_bucket_raw();
+        RevMoveBuckets {
+            raw: raw_bucket.offset(self.capacity as int),
+            hashes_end: raw_bucket.hash,
+            elems_left: self.size
          }
      }
  }
  
-pub struct RawBuckets<'a, K, V> {
+/// A raw iterator. The basis for some other iterators in this module. Although
+/// this interface is safe, it's not used outside this module.
+struct RawBuckets<'a, K, V> {
      raw: RawBucket<K, V>,
      hashes_end: *mut u64
  }
@@ -667,6 +694,8 @@ impl<'a, K, V> Iterator<RawBucket<K, V>> for RawBuckets<'a, K, V> {
      fn next(&mut self) -> Option<RawBucket<K, V>> {
          while self.raw.hash != self.hashes_end {
              unsafe {
+                // We are swapping out the pointer to a bucket and replacing
+                // it with the pointer to the next one.
                  let prev = ptr::replace(&mut self.raw, self.raw.offset(1));
                  if *prev.hash != EMPTY_BUCKET {
                      return Some(prev);
@@ -678,7 +707,10 @@ fn next(&mut self) -> Option<RawBucket<K, V>> {
      }
  }
  
-pub struct RevMoveBuckets<'a, K, V> {
+/// An iterator that moves out buckets in reverse order. It leaves the table
+/// in an an inconsistent state and should only be used for dropping
+/// the table's remaining entries. It's used in the implementation of Drop.
+struct RevMoveBuckets<'a, K, V> {
      raw: RawBucket<K, V>,
      hashes_end: *mut u64,
      elems_left: uint
@@ -708,43 +740,13 @@ fn next(&mut self) -> Option<(K, V)> {
      }
  }
  
-// `read_all_mut` casts a `*u64` to a `*SafeHash`. Since we statically
-// ensure that a `FullIndex` points to an index with a non-zero hash,
-// and a `SafeHash` is just a `u64` with a different name, this is
-// safe.
-//
-// This test ensures that a `SafeHash` really IS the same size as a
-// `u64`. If you need to change the size of `SafeHash` (and
-// consequently made this test fail), `read_all_mut` needs to be
-// modified to no longer assume this.
-#[test]
-fn can_alias_safehash_as_u64() {
-    assert_eq!(size_of::<SafeHash>(), size_of::<u64>())
-}
-
-/// Note: stage0-specific version that lacks bound.
-#[cfg(stage0)]
-pub struct Entries<'a, K, V> {
-    iter: RawBuckets<'a, K, V>,
-    elems_left: uint,
-}
-
  /// Iterator over shared references to entries in a table.
-#[cfg(not(stage0))]
  pub struct Entries<'a, K: 'a, V: 'a> {
      iter: RawBuckets<'a, K, V>,
      elems_left: uint,
  }
  
-/// Note: stage0-specific version that lacks bound.
-#[cfg(stage0)]
-pub struct MutEntries<'a, K, V> {
-    iter: RawBuckets<'a, K, V>,
-    elems_left: uint,
-}
-
  /// Iterator over mutable references to entries in a table.
-#[cfg(not(stage0))]
  pub struct MutEntries<'a, K: 'a, V: 'a> {
      iter: RawBuckets<'a, K, V>,
      elems_left: uint,
@@ -830,14 +832,14 @@ fn clone(&self) -> RawTable<K, V> {
                              mem::overwrite(new_buckets.raw.key, k);
                              mem::overwrite(new_buckets.raw.val, v);
                          }
-                        _  => {
+                        Empty(..) => {
                              *new_buckets.raw.hash = EMPTY_BUCKET;
                          }
                      }
                      new_buckets.next();
                      buckets.next();
                  }
-            }
+            };
  
              new_ht.size = self.size();
  
@@ -852,12 +854,14 @@ fn drop(&mut self) {
          if self.hashes.is_null() {
              return;
          }
-        // This is in reverse because we're likely to have partially taken
+        // This is done in reverse because we've likely partially taken
          // some elements out with `.move_iter()` from the front.
          // Check if the size is 0, so we don't do a useless scan when
          // dropping empty tables such as on resize.
-        // Avoid double free of elements already moved out.
-        for _ in self.rev_move_buckets() {}
+        // Also avoid double drop of elements that have been already moved out.
+        unsafe {
+            for _ in self.rev_move_buckets() {}
+        }
  
          let hashes_size = self.capacity * size_of::<u64>();
          let keys_size = self.capacity * size_of::<K>();
@@ -871,7 +875,5 @@ fn drop(&mut self) {
              // Remember how everything was allocated out of one buffer
              // during initialization? We only need one call to free here.
          }
-
-        self.hashes = RawPtr::null();
      }
  }
author	Piotr Czarnecki <pioczarn@gmail.com>
	Tue, 15 Jul 2014 20:58:35 +0000 (21:58 +0100)
committer	Piotr Czarnecki <pioczarn@gmail.com>
	Thu, 4 Sep 2014 22:22:32 +0000 (23:22 +0100)
src/libstd/collections/hashmap/bench.rs		patch \| blob \| history
src/libstd/collections/hashmap/map.rs		patch \| blob \| history
src/libstd/collections/hashmap/mod.rs		patch \| blob \| history
src/libstd/collections/hashmap/set.rs		patch \| blob \| history
src/libstd/collections/hashmap/table.rs		patch \| blob \| history