Auto merge of #1721 - henryboisdequin:add-atomic-min-and-max, r=oli-obk
[rust.git] / src / data_race.rs
index 35898f1d937f8cdd18d3882f4703268c7de7cf46..e8071845c7d76240046ef8c3be17023474c772ad 100644 (file)
@@ -1,29 +1,90 @@
-//! Implementation of a data-race detector
-//!  uses Lamport Timestamps / Vector-clocks
-//!  base on the Dyamic Race Detection for C++:
-//!     - https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf
-//!  to extend data-race detection to work correctly with fences
-//!  and RMW operations
+//! Implementation of a data-race detector using Lamport Timestamps / Vector-clocks
+//! based on the Dynamic Race Detection for C++:
+//! https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf
+//! which does not report false-positives when fences are used, and gives better
+//! accuracy in the presence of read-modify-write operations.
+//!
+//! The implementation contains modifications to correctly model the changes to the memory model in C++20
+//! regarding the weakening of release sequences: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0982r1.html.
+//! Relaxed stores now unconditionally block all currently active release sequences and so per-thread tracking of release
+//! sequences is not needed.
+//!
+//! The implementation also models races with memory allocation and deallocation by treating allocation and
+//! deallocation as a type of write internally for detecting data-races.
+//!
 //! This does not explore weak memory orders and so can still miss data-races
-//!  but should not report false-positives
-
-use std::{fmt::{self, Debug}, cmp::Ordering, rc::Rc, cell::{Cell, RefCell, Ref, RefMut}, ops::Index};
-
+//! but should not report false-positives
+//!
+//! Data-race definition from (https://en.cppreference.com/w/cpp/language/memory_model#Threads_and_data_races):
+//! a data race occurs between two memory accesses if they are on different threads, at least one
+//! access is non-atomic, at least one access is a write, and neither access happens-before the other.
+//! Read the link for the full definition.
+//!
+//! This re-uses vector indexes for threads that are known to be unable to report data-races.
+//! This is valid because a vector index is only re-used once all currently-active (not-terminated)
+//! threads have an internal vector clock that happens-after the join operation of the candidate
+//! thread. Threads that have not been joined on are not considered. Since the thread's vector clock
+//! will only increase, and reporting a data-race requires that there is some index x where
+//! clock[x] > thread_clock, the condition clock[candidate-idx] > thread_clock can never hold after
+//! such a join and hence a data-race can never be reported for that vector index again.
+//! This means that the thread-index can be safely re-used, starting on the next timestamp for the
+//! newly created thread.
+//!
+//! The sequentially consistent ordering corresponds to the ordering in which the threads
+//! are currently scheduled; this means that the data-race detector has no additional
+//! logic for sequentially consistent accesses at the moment, since they are indistinguishable
+//! from acquire/release operations. If weak memory orderings are explored then this
+//! may need to change or be updated accordingly.
+//!
+//! Per the C++ spec for the memory model, a sequentially consistent operation:
+//!   "A load operation with this memory order performs an acquire operation,
+//!    a store performs a release operation, and read-modify-write performs
+//!    both an acquire operation and a release operation, plus a single total
+//!    order exists in which all threads observe all modifications in the same
+//!    order (see Sequentially-consistent ordering below) "
+//! So, in the absence of weak memory effects, a seq-cst load and a seq-cst store are identical
+//! to an acquire load and a release store given the global sequentially consistent order
+//! of the schedule.
+//!
+//! The timestamps used in the data-race detector assign a single timestamp to each sequence of
+//! non-atomic operations followed by a single atomic or concurrent operation.
+//! For example, Write, Read, Write, ThreadJoin will be represented by a single timestamp value on a thread.
+//! This is because extra increment operations between the operations in the sequence are not
+//! required for accurate data-race detection.
+//!
+//! As per the paper, a thread's timestamp is only incremented after a release operation is performed,
+//! so some atomic operations that only perform acquires do not increment the timestamp. Due to shared
+//! code, some atomic operations may increment the timestamp when not necessary, but this has no effect
+//! on the data-race detection code.
+//!
+//! FIXME:
+//! Currently we have our own local copy of the currently active thread index and names. This is due
+//! in part to the inability to access the current location of threads.active_thread inside the AllocExtra
+//! read, write and deallocate functions; it should be cleaned up in the future.
+
+use std::{
+    cell::{Cell, Ref, RefCell, RefMut},
+    fmt::Debug,
+    mem,
+    rc::Rc,
+};
+
+use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_index::vec::{Idx, IndexVec};
+use rustc_middle::{mir, ty::layout::TyAndLayout};
 use rustc_target::abi::Size;
-use rustc_middle::ty::layout::TyAndLayout;
-use rustc_data_structures::fx::FxHashMap;
-
-use smallvec::SmallVec;
 
-use crate::*;
+use crate::{
+    ImmTy, Immediate, InterpResult, MPlaceTy, MemPlaceMeta, MemoryKind, MiriEvalContext,
+    MiriEvalContextExt, MiriMemoryKind, OpTy, Pointer, RangeMap, Scalar, ScalarMaybeUninit, Tag,
+    ThreadId, VClock, VTimestamp, VectorIdx,
+};
 
 pub type AllocExtra = VClockAlloc;
 pub type MemoryExtra = Rc<GlobalState>;
 
-/// Valid atomic read-write operations, alias of atomic::Ordering (not non-exhaustive)
+/// Valid atomic read-write operations, alias of atomic::Ordering (not non-exhaustive).
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
-pub enum AtomicRWOp {
+pub enum AtomicRwOp {
     Relaxed,
     Acquire,
     Release,
@@ -31,7 +92,7 @@ pub enum AtomicRWOp {
     SeqCst,
 }
 
-/// Valid atomic read operations, subset of atomic::Ordering
+/// Valid atomic read operations, subset of atomic::Ordering.
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
 pub enum AtomicReadOp {
     Relaxed,
@@ -39,7 +100,7 @@ pub enum AtomicReadOp {
     SeqCst,
 }
 
-/// Valid atomic write operations, subset of atomic::Ordering
+/// Valid atomic write operations, subset of atomic::Ordering.
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
 pub enum AtomicWriteOp {
     Relaxed,
@@ -47,8 +108,7 @@ pub enum AtomicWriteOp {
     SeqCst,
 }
 
-
-/// Valid atomic fence operations, subset of atomic::Ordering
+/// Valid atomic fence operations, subset of atomic::Ordering.
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
 pub enum AtomicFenceOp {
     Acquire,
@@ -57,1351 +117,1388 @@ pub enum AtomicFenceOp {
     SeqCst,
 }
 
-/// Evaluation context extensions
-impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriEvalContext<'mir, 'tcx> {}
-pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx> {
-
-    /// Variant of `read_immediate` that does not perform `data-race` checks.
-    fn read_immediate_racy(&self, op: MPlaceTy<'tcx, Tag>) -> InterpResult<'tcx, ImmTy<'tcx, Tag>> {
-        let this = self.eval_context_ref();
-        let data_race = &*this.memory.extra.data_race;
-        let old = data_race.multi_threaded.get();
-
-        data_race.multi_threaded.set(false);
-        let res = this.read_immediate(op.into());
-
-        data_race.multi_threaded.set(old);
-        res
-    }
-    
-    /// Variant of `write_immediate` that does not perform `data-race` checks.
-    fn write_immediate_racy(
-        &mut self, src: Immediate<Tag>, dest: MPlaceTy<'tcx, Tag>
-    ) -> InterpResult<'tcx> {
-        let this = self.eval_context_mut();
-        let data_race = &*this.memory.extra.data_race;
-        let old = data_race.multi_threaded.get();
+/// The current set of vector clocks describing the state
+/// of a thread: it contains the happens-before clock and
+/// additional metadata to model atomic fence operations.
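+///
+/// A minimal sketch (not part of the detector) of how these clocks compose for fence
+/// synchronization: thread A releases via a fence followed by a relaxed store, and
+/// thread B synchronizes via a relaxed load followed by an acquire fence.
+/// ```rust,ignore
+/// let mut a = ThreadClockSet::default();
+/// a.increment_clock(VectorIdx::new(0)); // A performs some prior writes
+/// a.apply_release_fence();              // A: fence(Release) snapshots its clock
+///
+/// let mut b = ThreadClockSet::default();
+/// // B: relaxed load of a location that A relaxed-stored after its fence, so the
+/// // store's sync vector equals `a.fence_release`.
+/// b.fence_acquire.join(&a.fence_release);
+/// b.apply_acquire_fence();              // B: fence(Acquire) acquires A's clock
+/// ```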
+#[derive(Clone, Default, Debug)]
+struct ThreadClockSet {
+    /// The increasing clock representing timestamps
+    /// that happen-before this thread.
+    clock: VClock,
 
-        data_race.multi_threaded.set(false);
-        let imm = this.write_immediate(src, dest.into());
+    /// The set of timestamps that will happen-before this
+    /// thread once it performs an acquire fence.
+    fence_acquire: VClock,
 
-        let data_race = &*this.memory.extra.data_race;
-        data_race.multi_threaded.set(old);
-        imm
-    }
+    /// The last timestamp of happens-before relations that
+    /// have been released by this thread by a fence.
+    fence_release: VClock,
+}
 
-    /// Variant of `read_scalar` that does not perform data-race checks.
-    fn read_scalar_racy(
-        &self, op: MPlaceTy<'tcx, Tag>
-    )-> InterpResult<'tcx, ScalarMaybeUninit<Tag>> {
-        Ok(self.read_immediate_racy(op)?.to_scalar_or_uninit())
+impl ThreadClockSet {
+    /// Apply the effects of a release fence to this
+    /// set of thread vector clocks.
+    #[inline]
+    fn apply_release_fence(&mut self) {
+        self.fence_release.clone_from(&self.clock);
     }
 
-    /// Variant of `write_scalar` that does not perform data-race checks.
-    fn write_scalar_racy(
-        &mut self, val: ScalarMaybeUninit<Tag>, dest: MPlaceTy<'tcx, Tag>
-    ) -> InterpResult<'tcx> {
-        self.write_immediate_racy(Immediate::Scalar(val.into()), dest)
+    /// Apply the effects of a acquire fence to this
+    /// set of thread vector clocks.
+    #[inline]
+    fn apply_acquire_fence(&mut self) {
+        self.clock.join(&self.fence_acquire);
     }
 
-    /// Variant of `read_scalar_at_offset` helper function that does not perform
-    /// `data-race checks.
-    fn read_scalar_at_offset_racy(
-        &self,
-        op: OpTy<'tcx, Tag>,
-        offset: u64,
-        layout: TyAndLayout<'tcx>,
-    ) -> InterpResult<'tcx, ScalarMaybeUninit<Tag>> {
-        let this = self.eval_context_ref();
-        let op_place = this.deref_operand(op)?;
-        let offset = Size::from_bytes(offset);
-        // Ensure that the following read at an offset is within bounds
-        assert!(op_place.layout.size >= offset + layout.size);
-        let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?;
-        this.read_scalar_racy(value_place.into())
+    /// Increment the happens-before clock at a
+    /// known index.
+    #[inline]
+    fn increment_clock(&mut self, index: VectorIdx) {
+        self.clock.increment_index(index);
     }
 
-    /// Variant of `write_scalar_at_offfset` helper function that does not perform
-    ///  data-race checks.
-    fn write_scalar_at_offset_racy(
-        &mut self,
-        op: OpTy<'tcx, Tag>,
-        offset: u64,
-        value: impl Into<ScalarMaybeUninit<Tag>>,
-        layout: TyAndLayout<'tcx>,
-    ) -> InterpResult<'tcx, ()> {
-        let this = self.eval_context_mut();
-        let op_place = this.deref_operand(op)?;
-        let offset = Size::from_bytes(offset);
-        // Ensure that the following read at an offset is within bounds
-        assert!(op_place.layout.size >= offset + layout.size);
-        let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?;
-        this.write_scalar_racy(value.into(), value_place.into())
+    /// Join the happens-before clock with that of
+    /// another thread, used to model thread join
+    /// operations.
+    fn join_with(&mut self, other: &ThreadClockSet) {
+        self.clock.join(&other.clock);
     }
+}
 
-    /// Load the data race allocation state for a given memory place
-    ///  also returns the size and the offset of the result in the allocation
-    ///  metadata
-    fn load_data_race_state<'a>(
-        &'a mut self, place: MPlaceTy<'tcx, Tag>
-    ) -> InterpResult<'tcx, (&'a mut VClockAlloc, Size, Size)> where 'mir: 'a {
-        let this = self.eval_context_mut();
-
-        let ptr = place.ptr.assert_ptr();
-        let size = place.layout.size;
-        let data_race = &mut this.memory.get_raw_mut(ptr.alloc_id)?.extra.data_race;
+/// Error returned when a data race is detected;
+/// should be elaborated upon.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+pub struct DataRace;
 
-        Ok((data_race, size, ptr.offset))
-    }
-    
-    /// Update the data-race detector for an atomic read occuring at the
-    ///  associated memory-place and on the current thread
-    fn validate_atomic_load(
-        &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicReadOp
-    ) -> InterpResult<'tcx> {
-        let this = self.eval_context_mut();
-        let data_race = &*this.memory.extra.data_race;
-        if data_race.multi_threaded.get() {
-            data_race.advance_vector_clock();
-
-            let (
-                alloc, size, offset
-            ) = this.load_data_race_state(place)?;
-            log::trace!(
-                "Atomic load on {:?} with ordering {:?}, in memory({:?}, offset={}, size={})",
-                alloc.global.current_thread(), atomic,
-                place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes()
-            );
-
-            let mut current_state = alloc.global.current_thread_state_mut();
-            if atomic == AtomicReadOp::Relaxed {
-                // Perform relaxed atomic load
-                for (_,range) in alloc.alloc_ranges.get_mut().iter_mut(offset, size) {
-                    range.load_relaxed(&mut *current_state);
-                }
-            }else{
-                // Perform acquire(or seq-cst) atomic load
-                for (_,range) in alloc.alloc_ranges.get_mut().iter_mut(offset, size) {
-                    range.acquire(&mut *current_state);
-                }
-            }
+/// Externally stored memory cell clocks,
+/// explicitly to reduce memory usage for the
+/// common case where no atomic operations
+/// exist on the memory cell.
+#[derive(Clone, PartialEq, Eq, Default, Debug)]
+struct AtomicMemoryCellClocks {
+    /// The clock-vector of the timestamp of the last atomic
+    /// read operation performed by each thread.
+    /// This detects potential data-races between atomic read
+    /// and non-atomic write operations.
+    read_vector: VClock,
+
+    /// The clock-vector of the timestamp of the last atomic
+    /// write operation performed by each thread.
+    /// This detects potential data-races between atomic write
+    /// and non-atomic read or write operations.
+    write_vector: VClock,
 
-            // Log changes to atomic memory
-            if log::log_enabled!(log::Level::Trace) {
-                for (_,range) in alloc.alloc_ranges.get_mut().iter(offset, size) {
-                    log::trace!(
-                        "  updated atomic memory({:?}, offset={}, size={}) to {:#?}",
-                        place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes(),
-                        range.atomic_ops
-                    );
-                }
-            }
+    /// Synchronization vector for acquire-release semantics;
+    /// contains the vector of timestamps that will
+    /// happen-before a thread if an acquire-load is
+    /// performed on the data.
+    sync_vector: VClock,
+}
 
-            std::mem::drop(current_state);
-            let data_race = &*this.memory.extra.data_race;
-            data_race.advance_vector_clock();
+/// Type of write operation: allocating memory,
+/// non-atomic writes, and deallocating memory
+/// are all treated as writes for the purpose
+/// of the data-race detector.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+enum WriteType {
+    /// Allocate memory.
+    Allocate,
+
+    /// Standard unsynchronized write.
+    Write,
+
+    /// Deallocate memory.
+    /// Note that when memory is deallocated first, later non-atomic accesses
+    /// will be reported as use-after-free, not as data races.
+    /// (Same for `Allocate` above.)
+    Deallocate,
+}
+impl WriteType {
+    fn get_descriptor(self) -> &'static str {
+        match self {
+            WriteType::Allocate => "Allocate",
+            WriteType::Write => "Write",
+            WriteType::Deallocate => "Deallocate",
         }
-        Ok(())
     }
+}
 
-    /// Update the data-race detector for an atomic write occuring at the
-    ///  associated memory-place and on the current thread
-    fn validate_atomic_store(
-        &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicWriteOp
-    ) -> InterpResult<'tcx> {
-        let this = self.eval_context_mut();
-        let data_race = &*this.memory.extra.data_race;
-        if data_race.multi_threaded.get() {
-            data_race.advance_vector_clock();
-
-            let (
-                alloc, size, offset
-            ) = this.load_data_race_state(place)?;
-            let current_thread = alloc.global.current_thread();
-            let mut current_state = alloc.global.current_thread_state_mut();
-            log::trace!(
-                "Atomic store on {:?} with ordering {:?}, in memory({:?}, offset={}, size={})",
-                current_thread, atomic,
-                place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes()
-            );
-
-            if atomic == AtomicWriteOp::Relaxed {
-                // Perform relaxed atomic store
-                for (_,range) in alloc.alloc_ranges.get_mut().iter_mut(offset, size) {
-                    range.store_relaxed(&mut *current_state, current_thread);
-                }
-            }else{
-                // Perform release(or seq-cst) atomic store
-                for (_,range) in alloc.alloc_ranges.get_mut().iter_mut(offset, size) {
-                    range.release(&mut *current_state, current_thread);
-                }
-            }
+/// Memory Cell vector clock metadata
+/// for data-race detection.
+#[derive(Clone, PartialEq, Eq, Debug)]
+struct MemoryCellClocks {
+    /// The vector-clock timestamp of the last write,
+    /// corresponding to the writing thread's timestamp.
+    write: VTimestamp,
+
+    /// The identifier of the vector index, corresponding to a thread
+    /// that performed the last write operation.
+    write_index: VectorIdx,
+
+    /// The type of operation that the write index represents,
+    /// either newly allocated memory, a non-atomic write or
+    /// a deallocation of memory.
+    write_type: WriteType,
+
+    /// The vector-clock of the timestamp of the last read operation
+    /// performed by a thread since the last write operation occurred.
+    /// It is reset to zero on each write operation.
+    read: VClock,
 
-            // Log changes to atomic memory
-            if log::log_enabled!(log::Level::Trace) {
-                for (_,range) in alloc.alloc_ranges.get_mut().iter(offset, size) {
-                    log::trace!(
-                        "  updated atomic memory({:?}, offset={}, size={}) to {:#?}",
-                        place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes(),
-                        range.atomic_ops
-                    );
-                }
-            }
+    /// Atomic acquire & release sequence tracking clocks.
+    /// For non-atomic memory in the common case this
+    /// value is set to None.
+    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
+}
 
-            std::mem::drop(current_state);
-            let data_race = &*this.memory.extra.data_race;
-            data_race.advance_vector_clock();
+impl MemoryCellClocks {
+    /// Create a new set of clocks representing memory allocated
+    /// at a given vector timestamp and index.
+    fn new(alloc: VTimestamp, alloc_index: VectorIdx) -> Self {
+        MemoryCellClocks {
+            read: VClock::default(),
+            write: alloc,
+            write_index: alloc_index,
+            write_type: WriteType::Allocate,
+            atomic_ops: None,
         }
-        Ok(())
     }
 
-    /// Update the data-race detector for an atomic read-modify-write occuring
-    ///  at the associated memory place and on the current thread
-    fn validate_atomic_rmw(
-        &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicRWOp
-    ) -> InterpResult<'tcx> {
-        use AtomicRWOp::*;
-        let this = self.eval_context_mut();
-        let data_race = &*this.memory.extra.data_race;
-        if data_race.multi_threaded.get() {
-            data_race.advance_vector_clock();
-
-            let (
-                alloc, size, offset
-            ) = this.load_data_race_state(place)?;
-            let current_thread = alloc.global.current_thread();
-            let mut current_state = alloc.global.current_thread_state_mut();
-            log::trace!(
-                "Atomic RMW on {:?} with ordering {:?}, in memory({:?}, offset={}, size={})",
-                current_thread, atomic,
-                place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes()
-            );
-
-            let acquire = matches!(atomic, Acquire | AcqRel | SeqCst);
-            let release = matches!(atomic, Release | AcqRel | SeqCst);
-            for (_,range) in alloc.alloc_ranges.get_mut().iter_mut(offset, size) {
-                //FIXME: this is probably still slightly wrong due to the quirks
-                // in the c++11 memory model
-                if acquire {
-                    // Atomic RW-Op acquire
-                    range.acquire(&mut *current_state);
-                }else{
-                    range.load_relaxed(&mut *current_state);
-                }
-                if release {
-                    // Atomic RW-Op release
-                    range.rmw_release(&mut *current_state, current_thread);
-                }else{
-                    range.rmw_relaxed(&mut *current_state);
-                }
-            }
+    /// Load the internal atomic memory cells if they exist.
+    #[inline]
+    fn atomic(&self) -> Option<&AtomicMemoryCellClocks> {
+        match &self.atomic_ops {
+            Some(op) => Some(&*op),
+            None => None,
+        }
+    }
 
-            // Log changes to atomic memory
-            if log::log_enabled!(log::Level::Trace) {
-                for (_,range) in alloc.alloc_ranges.get_mut().iter(offset, size) {
-                    log::trace!(
-                        "  updated atomic memory({:?}, offset={}, size={}) to {:#?}",
-                        place.ptr.assert_ptr().alloc_id, offset.bytes(), size.bytes(),
-                        range.atomic_ops
-                    );
-                }
-            }
+    /// Load or create the internal atomic memory metadata
+    /// if it does not exist.
+    #[inline]
+    fn atomic_mut(&mut self) -> &mut AtomicMemoryCellClocks {
+        self.atomic_ops.get_or_insert_with(Default::default)
+    }
 
-            std::mem::drop(current_state);
-            let data_race = &*this.memory.extra.data_race;
-            data_race.advance_vector_clock();
+    /// Update memory cell data-race tracking for atomic
+    /// load acquire semantics. The acquire portion is a no-op if this
+    /// memory was not used previously as atomic memory.
+    fn load_acquire(
+        &mut self,
+        clocks: &mut ThreadClockSet,
+        index: VectorIdx,
+    ) -> Result<(), DataRace> {
+        self.atomic_read_detect(clocks, index)?;
+        if let Some(atomic) = self.atomic() {
+            clocks.clock.join(&atomic.sync_vector);
         }
         Ok(())
     }
 
-    /// Update the data-race detector for an atomic fence on the current thread
-    fn validate_atomic_fence(&mut self, atomic: AtomicFenceOp) -> InterpResult<'tcx> {
-        let this = self.eval_context_mut();
-        let data_race = &*this.memory.extra.data_race;
-        if data_race.multi_threaded.get() {
-            data_race.advance_vector_clock();
-
-            log::trace!("Atomic fence on {:?} with ordering {:?}", data_race.current_thread(), atomic);
-            // Apply data-race detection for the current fences
-            //  this treats AcqRel and SeqCst as the same as a acquire
-            //  and release fence applied in the same timestamp.
-            if atomic != AtomicFenceOp::Release {
-                // Either Acquire | AcqRel | SeqCst
-                data_race.current_thread_state_mut().apply_acquire_fence();
-            }
-            if atomic != AtomicFenceOp::Acquire {
-                // Either Release | AcqRel | SeqCst
-                data_race.current_thread_state_mut().apply_release_fence();
-            }
-
-            data_race.advance_vector_clock();
+    /// Update memory cell data-race tracking for atomic
+    /// load relaxed semantics. The acquire-fence portion is a no-op if this
+    /// memory was not used previously as atomic memory.
+    fn load_relaxed(
+        &mut self,
+        clocks: &mut ThreadClockSet,
+        index: VectorIdx,
+    ) -> Result<(), DataRace> {
+        self.atomic_read_detect(clocks, index)?;
+        if let Some(atomic) = self.atomic() {
+            clocks.fence_acquire.join(&atomic.sync_vector);
         }
         Ok(())
     }
-}
 
-/// Handle for locks to express their
-///  acquire-release semantics
-#[derive(Clone, Debug, Default)]
-pub struct DataRaceLockHandle {
-
-    /// Internal acquire-release clock
-    ///  to express the acquire release sync
-    ///  found in concurrency primitives
-    clock: VClock,
-}
-impl DataRaceLockHandle {
-    pub fn set_values(&mut self, other: &Self) {
-        self.clock.set_values(&other.clock)
-    }
-    pub fn reset(&mut self) {
-        self.clock.set_zero_vector();
+    /// Update the memory cell data-race tracking for atomic
+    /// store release semantics.
+    fn store_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> {
+        self.atomic_write_detect(clocks, index)?;
+        let atomic = self.atomic_mut();
+        atomic.sync_vector.clone_from(&clocks.clock);
+        Ok(())
     }
-}
-
-
-/// Avoid an atomic allocation for the common
-///  case with atomic operations where the number
-///  of active release sequences is small
-#[derive(Clone, PartialEq, Eq)]
-enum AtomicReleaseSequences {
 
-    /// Contains one or no values
-    ///  if empty: (None, reset vector clock)
-    ///  if one:   (Some(thread), thread_clock)
-    ReleaseOneOrEmpty(Option<ThreadId>, VClock),
-
-    /// Contains two or more values
-    ///  stored in a hash-map of thread id to
-    ///  vector clocks
-    ReleaseMany(FxHashMap<ThreadId, VClock>)
-}
-impl AtomicReleaseSequences {
+    /// Update the memory cell data-race tracking for atomic
+    /// store relaxed semantics.
+    fn store_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> {
+        self.atomic_write_detect(clocks, index)?;
+
+        // The handling of release sequences was changed in C++20 and so
+        // the code here is different to the paper since now all relaxed
+        // stores block release sequences. The exception for same-thread
+        // relaxed stores has been removed.
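+        //
+        // Illustration (not from the paper): if thread A does `x.store(1, Release)` followed
+        // by `x.store(2, Relaxed)`, another thread's `x.load(Acquire)` that reads 2 no longer
+        // synchronizes-with A's release store; the relaxed store below overwrites the sync
+        // vector with the storing thread's last fence-release clock.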
+        let atomic = self.atomic_mut();
+        atomic.sync_vector.clone_from(&clocks.fence_release);
+        Ok(())
+    }
 
-    /// Return an empty set of atomic release sequences
-    #[inline]
-    fn new() -> AtomicReleaseSequences {
-        Self::ReleaseOneOrEmpty(None, VClock::default())
+    /// Update the memory cell data-race tracking for atomic
+    /// store release semantics for RMW operations.
+    fn rmw_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> {
+        self.atomic_write_detect(clocks, index)?;
+        let atomic = self.atomic_mut();
+        atomic.sync_vector.join(&clocks.clock);
+        Ok(())
     }
 
-    /// Remove all values except for the value stored at `thread` and set
-    ///  the vector clock to the associated `clock` value
-    #[inline]
-    fn clear_and_set(&mut self, thread: ThreadId, clock: &VClock) {
-        match self {
-            Self::ReleaseOneOrEmpty(id, rel_clock) => {
-                *id = Some(thread);
-                rel_clock.set_values(clock);
-            }
-            Self::ReleaseMany(_) => {
-                *self = Self::ReleaseOneOrEmpty(Some(thread), clock.clone());
-            }
-        }
+    /// Update the memory cell data-race tracking for atomic
+    /// store relaxed semantics for RMW operations.
+    fn rmw_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> {
+        self.atomic_write_detect(clocks, index)?;
+        let atomic = self.atomic_mut();
+        atomic.sync_vector.join(&clocks.fence_release);
+        Ok(())
     }
 
-    /// Remove all values except for the value stored at `thread`
-    #[inline]
-    fn clear_and_retain(&mut self, thread: ThreadId) {
-        match self {
-            Self::ReleaseOneOrEmpty(id, rel_clock) => {
-                // If the id is the same, then reatin the value
-                //  otherwise delete and clear the release vector clock
-                if *id != Some(thread) {
-                    *id = None;
-                    rel_clock.set_zero_vector();
-                }
-            },
-            Self::ReleaseMany(hash_map) => {
-                // Retain only the thread element, so reduce to size
-                //  of 1 or 0, and move to smaller format
-                if let Some(clock) = hash_map.remove(&thread) {
-                    *self = Self::ReleaseOneOrEmpty(Some(thread), clock);
-                }else{
-                    *self = Self::new();
-                }
-            }
+    /// Detect data-races with an atomic read, caused by a non-atomic write that does
+    /// not happen-before the atomic-read.
+    fn atomic_read_detect(
+        &mut self,
+        clocks: &ThreadClockSet,
+        index: VectorIdx,
+    ) -> Result<(), DataRace> {
+        log::trace!("Atomic read with vectors: {:#?} :: {:#?}", self, clocks);
+        if self.write <= clocks.clock[self.write_index] {
+            let atomic = self.atomic_mut();
+            atomic.read_vector.set_at_index(&clocks.clock, index);
+            Ok(())
+        } else {
+            Err(DataRace)
         }
     }
 
-    /// Insert a release sequence at `thread` with values `clock`
-    fn insert(&mut self, thread: ThreadId, clock: &VClock) {
-        match self {
-            Self::ReleaseOneOrEmpty(id, rel_clock) => {
-                if id.map_or(true, |id| id == thread) {
-                    *id = Some(thread);
-                    rel_clock.set_values(clock);
-                }else{
-                    let mut hash_map = FxHashMap::default();
-                    hash_map.insert(thread, clock.clone());
-                    hash_map.insert(id.unwrap(), rel_clock.clone());
-                    *self = Self::ReleaseMany(hash_map);
-                }
-            },
-            Self::ReleaseMany(hash_map) => {
-                hash_map.insert(thread, clock.clone());
-            }
+    /// Detect data-races with an atomic write, either with a non-atomic read or with
+    /// a non-atomic write.
+    fn atomic_write_detect(
+        &mut self,
+        clocks: &ThreadClockSet,
+        index: VectorIdx,
+    ) -> Result<(), DataRace> {
+        log::trace!("Atomic write with vectors: {:#?} :: {:#?}", self, clocks);
+        if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock {
+            let atomic = self.atomic_mut();
+            atomic.write_vector.set_at_index(&clocks.clock, index);
+            Ok(())
+        } else {
+            Err(DataRace)
         }
     }
 
-    /// Return the release sequence at `thread` if one exists
-    #[inline]
-    fn load(&self, thread: ThreadId) -> Option<&VClock> {
-        match self {
-            Self::ReleaseOneOrEmpty(id, clock) => {
-                if *id == Some(thread) {
-                    Some(clock)
-                }else{
-                    None
-                }
-            },
-            Self::ReleaseMany(hash_map) => {
-                hash_map.get(&thread)
+    /// Detect races for non-atomic read operations at the current memory cell;
+    /// returns `Err(DataRace)` if a data-race is detected.
+    fn read_race_detect(
+        &mut self,
+        clocks: &ThreadClockSet,
+        index: VectorIdx,
+    ) -> Result<(), DataRace> {
+        log::trace!("Unsynchronized read with vectors: {:#?} :: {:#?}", self, clocks);
+        if self.write <= clocks.clock[self.write_index] {
+            let race_free = if let Some(atomic) = self.atomic() {
+                atomic.write_vector <= clocks.clock
+            } else {
+                true
+            };
+            if race_free {
+                self.read.set_at_index(&clocks.clock, index);
+                Ok(())
+            } else {
+                Err(DataRace)
             }
+        } else {
+            Err(DataRace)
         }
     }
-}
 
-/// Custom debug implementation to correctly
-///  print debug as a logical mapping from threads
-///  to vector-clocks
-impl Debug for AtomicReleaseSequences {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Self::ReleaseOneOrEmpty(None,_) => {
-                f.debug_map().finish()
-            },
-            Self::ReleaseOneOrEmpty(Some(id), clock) => {
-                f.debug_map().entry(&id, &clock).finish()
-            },
-            Self::ReleaseMany(hash_map) => {
-                Debug::fmt(hash_map, f)
+    /// Detect races for non-atomic write operations at the current memory cell;
+    /// returns `Err(DataRace)` if a data-race is detected.
+    fn write_race_detect(
+        &mut self,
+        clocks: &ThreadClockSet,
+        index: VectorIdx,
+        write_type: WriteType,
+    ) -> Result<(), DataRace> {
+        log::trace!("Unsynchronized write with vectors: {:#?} :: {:#?}", self, clocks);
+        if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock {
+            let race_free = if let Some(atomic) = self.atomic() {
+                atomic.write_vector <= clocks.clock && atomic.read_vector <= clocks.clock
+            } else {
+                true
+            };
+            if race_free {
+                self.write = clocks.clock[index];
+                self.write_index = index;
+                self.write_type = write_type;
+                self.read.set_zero_vector();
+                Ok(())
+            } else {
+                Err(DataRace)
             }
+        } else {
+            Err(DataRace)
         }
     }
 }
 
-/// Externally stored memory cell clocks
-///  explicitly to reduce memory usage for the
-///  common case where no atomic operations
-///  exists on the memory cell
-#[derive(Clone, PartialEq, Eq, Debug)]
-struct AtomicMemoryCellClocks {
+/// Evaluation context extensions.
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {}
+pub trait EvalContextExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> {
+    /// Atomic variant of read_scalar_at_offset.
+    fn read_scalar_at_offset_atomic(
+        &self,
+        op: &OpTy<'tcx, Tag>,
+        offset: u64,
+        layout: TyAndLayout<'tcx>,
+        atomic: AtomicReadOp,
+    ) -> InterpResult<'tcx, ScalarMaybeUninit<Tag>> {
+        let this = self.eval_context_ref();
+        let op_place = this.deref_operand(op)?;
+        let offset = Size::from_bytes(offset);
 
-    /// Synchronization vector for acquire-release semantics
-    sync_vector: VClock,
+        // Ensure that the following read at an offset is within bounds.
+        assert!(op_place.layout.size >= offset + layout.size);
+        let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?;
+        this.read_scalar_atomic(&value_place, atomic)
+    }
 
-    /// The Hash-Map of all threads for which a release
-    ///  sequence exists in the memory cell 
-    release_sequences: AtomicReleaseSequences,
-}
+    /// Atomic variant of write_scalar_at_offset.
+    fn write_scalar_at_offset_atomic(
+        &mut self,
+        op: &OpTy<'tcx, Tag>,
+        offset: u64,
+        value: impl Into<ScalarMaybeUninit<Tag>>,
+        layout: TyAndLayout<'tcx>,
+        atomic: AtomicWriteOp,
+    ) -> InterpResult<'tcx> {
+        let this = self.eval_context_mut();
+        let op_place = this.deref_operand(op)?;
+        let offset = Size::from_bytes(offset);
 
-/// Memory Cell vector clock metadata
-///  for data-race detection
-#[derive(Clone, PartialEq, Eq, Debug)]
-struct MemoryCellClocks {
+        // Ensure that the following write at an offset is within bounds.
+        assert!(op_place.layout.size >= offset + layout.size);
+        let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?;
+        this.write_scalar_atomic(value.into(), &value_place, atomic)
+    }
 
-    /// The vector-clock of the last write
-    write: Timestamp,
+    /// Perform an atomic read operation at the memory location.
+    fn read_scalar_atomic(
+        &self,
+        place: &MPlaceTy<'tcx, Tag>,
+        atomic: AtomicReadOp,
+    ) -> InterpResult<'tcx, ScalarMaybeUninit<Tag>> {
+        let this = self.eval_context_ref();
+        let scalar = this.allow_data_races_ref(move |this| this.read_scalar(&place.into()))?;
+        self.validate_atomic_load(place, atomic)?;
+        Ok(scalar)
+    }
 
-    /// The id of the thread that performed the last write to this memory location
-    write_thread: ThreadId,
+    /// Perform an atomic write operation at the memory location.
+    fn write_scalar_atomic(
+        &mut self,
+        val: ScalarMaybeUninit<Tag>,
+        dest: &MPlaceTy<'tcx, Tag>,
+        atomic: AtomicWriteOp,
+    ) -> InterpResult<'tcx> {
+        let this = self.eval_context_mut();
+        this.allow_data_races_mut(move |this| this.write_scalar(val, &(*dest).into()))?;
+        self.validate_atomic_store(dest, atomic)
+    }
 
-    /// The vector-clock of the set of previous reads
-    ///  each index is set to the timestamp that the associated
-    ///  thread last read this value.
-    read: VClock,
+    /// Perform an atomic operation on a memory location.
+    fn atomic_op_immediate(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        rhs: &ImmTy<'tcx, Tag>,
+        op: mir::BinOp,
+        neg: bool,
+        atomic: AtomicRwOp,
+    ) -> InterpResult<'tcx, ImmTy<'tcx, Tag>> {
+        let this = self.eval_context_mut();
 
-    /// Atomic acquire & release sequence tracking clocks
-    ///  for non-atomic memory in the common case this
-    ///  value is set to None
-    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
-}
+        let old = this.allow_data_races_mut(|this| this.read_immediate(&place.into()))?;
 
-/// Create a default memory cell clocks instance
-///  for uninitialized memory
-impl Default for MemoryCellClocks {
-    fn default() -> Self {
-        MemoryCellClocks {
-            read: VClock::default(),
-            write: 0,
-            write_thread: ThreadId::new(u32::MAX as usize),
-            atomic_ops: None
-        }
+        // Atomics wrap around on overflow.
+        let val = this.binary_op(op, &old, rhs)?;
+        let val = if neg { this.unary_op(mir::UnOp::Not, &val)? } else { val };
+        this.allow_data_races_mut(|this| this.write_immediate(*val, &(*place).into()))?;
+
+        this.validate_atomic_rmw(place, atomic)?;
+        Ok(old)
     }
-}
 
-impl MemoryCellClocks {
+    /// Perform an atomic exchange with a memory place and a new
+    /// scalar value; the old value is returned.
+    fn atomic_exchange_scalar(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        new: ScalarMaybeUninit<Tag>,
+        atomic: AtomicRwOp,
+    ) -> InterpResult<'tcx, ScalarMaybeUninit<Tag>> {
+        let this = self.eval_context_mut();
 
-    /// Load the internal atomic memory cells if they exist
-    #[inline]
-    fn atomic(&mut self) -> Option<&AtomicMemoryCellClocks> {
-        match &self.atomic_ops {
-            Some(op) => Some(&*op),
-            None => None
-        }
+        let old = this.allow_data_races_mut(|this| this.read_scalar(&place.into()))?;
+        this.allow_data_races_mut(|this| this.write_scalar(new, &(*place).into()))?;
+        this.validate_atomic_rmw(place, atomic)?;
+        Ok(old)
     }
 
-    /// Load or create the internal atomic memory metadata
-    ///  if it does not exist
-    #[inline]
-    fn atomic_mut(&mut self) -> &mut AtomicMemoryCellClocks {
-        self.atomic_ops.get_or_insert_with(|| {
-            Box::new(AtomicMemoryCellClocks {
-                sync_vector: VClock::default(),
-                release_sequences: AtomicReleaseSequences::new()
-            })
-        })
-    }
+    /// Perform an atomic min/max operation on a memory place against a given
+    /// value; the old value is returned.
+    fn atomic_min_max_scalar(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        rhs: ImmTy<'tcx, Tag>,
+        min: bool,
+        atomic: AtomicRwOp,
+    ) -> InterpResult<'tcx, ImmTy<'tcx, Tag>> {
+        let this = self.eval_context_mut();
 
-    /// Update memory cell data-race tracking for atomic
-    ///  load acquire semantics, is a no-op if this memory was
-    ///  not used previously as atomic memory
-    fn acquire(&mut self, clocks: &mut ThreadClockSet) {
-        if let Some(atomic) = self.atomic() {
-            clocks.clock.join(&atomic.sync_vector);
-        }
-    }
-    /// Update memory cell data-race tracking for atomic
-    ///  load relaxed semantics, is a no-op if this memory was
-    ///  not used previously as atomic memory
-    fn load_relaxed(&mut self, clocks: &mut ThreadClockSet) {
-        if let Some(atomic) = self.atomic() {
-            clocks.fence_acquire.join(&atomic.sync_vector);
-        }
-    }
+        let old = this.allow_data_races_mut(|this| this.read_immediate(&place.into()))?;
+        let lt = this.overflowing_binary_op(mir::BinOp::Lt, &old, &rhs)?.0.to_bool()?;
 
+        let new_val = if min {
+            if lt { &old } else { &rhs }
+        } else {
+            if lt { &rhs } else { &old }
+        };
 
-    /// Update the memory cell data-race tracking for atomic
-    ///  store release semantics
-    fn release(&mut self, clocks: &ThreadClockSet, thread: ThreadId) {
-        let atomic = self.atomic_mut();
-        atomic.sync_vector.set_values(&clocks.clock);
-        atomic.release_sequences.clear_and_set(thread, &clocks.clock);
+        this.allow_data_races_mut(|this| this.write_immediate_to_mplace(**new_val, place))?;
+
+        this.validate_atomic_rmw(&place, atomic)?;
+
+        // Return the old value.
+        Ok(old)
     }
-    /// Update the memory cell data-race tracking for atomic
-    ///  store relaxed semantics
-    fn store_relaxed(&mut self, clocks: &ThreadClockSet, thread: ThreadId) {
-        let atomic = self.atomic_mut();
-        atomic.sync_vector.set_values(&clocks.fence_release);
-        if let Some(release) = atomic.release_sequences.load(thread) {
-            atomic.sync_vector.join(release);
+
+    /// Perform an atomic compare and exchange at a given memory location.
+    /// On success an atomic RMW operation is performed and on failure
+    /// only an atomic read occurs. If `can_fail_spuriously` is true,
+    /// then we treat it as a "compare_exchange_weak" operation, and
+    /// some portion of the time fail even when the values are actually
+    /// identical.
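+    ///
+    /// For reference, a sketch of the user-level pattern this models (`std::sync::atomic`):
+    /// ```rust,ignore
+    /// use std::sync::atomic::{AtomicUsize, Ordering};
+    /// let a = AtomicUsize::new(0);
+    /// let mut cur = a.load(Ordering::Relaxed);
+    /// while let Err(actual) =
+    ///     a.compare_exchange_weak(cur, cur + 1, Ordering::AcqRel, Ordering::Relaxed)
+    /// {
+    ///     cur = actual; // failure (possibly spurious): only an atomic load occurred
+    /// }
+    /// ```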
+    fn atomic_compare_exchange_scalar(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        expect_old: &ImmTy<'tcx, Tag>,
+        new: ScalarMaybeUninit<Tag>,
+        success: AtomicRwOp,
+        fail: AtomicReadOp,
+        can_fail_spuriously: bool,
+    ) -> InterpResult<'tcx, Immediate<Tag>> {
+        use rand::Rng as _;
+        let this = self.eval_context_mut();
+
+        // Failure ordering cannot be stronger than success ordering, therefore first attempt
+        // to read with the failure ordering and, if successful, then try again with the success
+        // read ordering and write in the success case.
+        // Read as immediate for the sake of `binary_op()`.
+        let old = this.allow_data_races_mut(|this| this.read_immediate(&(place.into())))?;
+        // `binary_op` will bail if either of them is not a scalar.
+        let eq = this.overflowing_binary_op(mir::BinOp::Eq, &old, expect_old)?.0;
+        // If the operation would succeed, but is "weak", fail some portion
+        // of the time, based on `rate`.
+        let rate = this.memory.extra.cmpxchg_weak_failure_rate;
+        let cmpxchg_success = eq.to_bool()?
+            && (!can_fail_spuriously || this.memory.extra.rng.borrow_mut().gen::<f64>() < rate);
+        let res = Immediate::ScalarPair(
+            old.to_scalar_or_uninit(),
+            Scalar::from_bool(cmpxchg_success).into(),
+        );
+
+        // Update ptr depending on comparison.
+        // If successful, perform a full rw-atomic validation;
+        // otherwise treat this as an atomic load with the fail ordering.
+        if cmpxchg_success {
+            this.allow_data_races_mut(|this| this.write_scalar(new, &(*place).into()))?;
+            this.validate_atomic_rmw(place, success)?;
+        } else {
+            this.validate_atomic_load(place, fail)?;
         }
-        atomic.release_sequences.clear_and_retain(thread);
+
+        // Return the old value.
+        Ok(res)
     }
-    /// Update the memory cell data-race tracking for atomic
-    ///  store release semantics for RMW operations
-    fn rmw_release(&mut self, clocks: &ThreadClockSet, thread: ThreadId) {
-        let atomic = self.atomic_mut();
-        atomic.sync_vector.join(&clocks.clock);
-        atomic.release_sequences.insert(thread, &clocks.clock);
+
+    /// Update the data-race detector for an atomic read occurring at the
+    /// associated memory-place and on the current thread.
+    fn validate_atomic_load(
+        &self,
+        place: &MPlaceTy<'tcx, Tag>,
+        atomic: AtomicReadOp,
+    ) -> InterpResult<'tcx> {
+        let this = self.eval_context_ref();
+        this.validate_atomic_op(
+            place,
+            atomic,
+            "Atomic Load",
+            move |memory, clocks, index, atomic| {
+                if atomic == AtomicReadOp::Relaxed {
+                    memory.load_relaxed(&mut *clocks, index)
+                } else {
+                    memory.load_acquire(&mut *clocks, index)
+                }
+            },
+        )
     }
-    /// Update the memory cell data-race tracking for atomic
-    ///  store relaxed semantics for RMW operations
-    fn rmw_relaxed(&mut self, clocks: &ThreadClockSet) {
-        let atomic = self.atomic_mut();
-        atomic.sync_vector.join(&clocks.fence_release);
+
+    /// Update the data-race detector for an atomic write occurring at the
+    /// associated memory-place and on the current thread.
+    fn validate_atomic_store(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        atomic: AtomicWriteOp,
+    ) -> InterpResult<'tcx> {
+        let this = self.eval_context_ref();
+        this.validate_atomic_op(
+            place,
+            atomic,
+            "Atomic Store",
+            move |memory, clocks, index, atomic| {
+                if atomic == AtomicWriteOp::Relaxed {
+                    memory.store_relaxed(clocks, index)
+                } else {
+                    memory.store_release(clocks, index)
+                }
+            },
+        )
     }
-    
-    
 
-    /// Detect races for non-atomic read operations at the current memory cell
-    ///  returns true if a data-race is detected
-    fn read_race_detect(&mut self, clocks: &ThreadClockSet, thread: ThreadId) -> bool {
-        if self.write <= clocks.clock[self.write_thread] {
-            self.read.set_at_thread(&clocks.clock, thread);
-            false
-        }else{
-            true
+    /// Update the data-race detector for an atomic read-modify-write occurring
+    /// at the associated memory place and on the current thread.
+    fn validate_atomic_rmw(
+        &mut self,
+        place: &MPlaceTy<'tcx, Tag>,
+        atomic: AtomicRwOp,
+    ) -> InterpResult<'tcx> {
+        use AtomicRwOp::*;
+        let acquire = matches!(atomic, Acquire | AcqRel | SeqCst);
+        let release = matches!(atomic, Release | AcqRel | SeqCst);
+        let this = self.eval_context_ref();
+        this.validate_atomic_op(place, atomic, "Atomic RMW", move |memory, clocks, index, _| {
+            if acquire {
+                memory.load_acquire(clocks, index)?;
+            } else {
+                memory.load_relaxed(clocks, index)?;
+            }
+            if release {
+                memory.rmw_release(clocks, index)
+            } else {
+                memory.rmw_relaxed(clocks, index)
+            }
+        })
+    }
+
+    /// Update the data-race detector for an atomic fence on the current thread.
+    fn validate_atomic_fence(&mut self, atomic: AtomicFenceOp) -> InterpResult<'tcx> {
+        let this = self.eval_context_mut();
+        if let Some(data_race) = &this.memory.extra.data_race {
+            data_race.maybe_perform_sync_operation(move |index, mut clocks| {
+                log::trace!("Atomic fence on {:?} with ordering {:?}", index, atomic);
+
+                // Apply data-race detection for the current fences;
+                // this treats AcqRel and SeqCst as the same as an acquire
+                // and release fence applied in the same timestamp.
+                if atomic != AtomicFenceOp::Release {
+                    // Either Acquire | AcqRel | SeqCst
+                    clocks.apply_acquire_fence();
+                }
+                if atomic != AtomicFenceOp::Acquire {
+                    // Either Release | AcqRel | SeqCst
+                    clocks.apply_release_fence();
+                }
+
+                // Increment timestamp in case of release semantics.
+                Ok(atomic != AtomicFenceOp::Acquire)
+            })
+        } else {
+            Ok(())
         }
     }
 
-    /// Detect races for non-atomic write operations at the current memory cell
-    ///  returns true if a data-race is detected
-    fn write_race_detect(&mut self, clocks: &ThreadClockSet, thread: ThreadId) -> bool {
-        if self.write <= clocks.clock[self.write_thread] && self.read <= clocks.clock {
-            self.write = clocks.clock[thread];
-            self.write_thread = thread;
-            self.read.set_zero_vector();
-            false
-        }else{
-            true
+    fn reset_vector_clocks(&mut self, ptr: Pointer<Tag>, size: Size) -> InterpResult<'tcx> {
+        let this = self.eval_context_mut();
+        if let Some(data_race) = &mut this.memory.extra.data_race {
+            if data_race.multi_threaded.get() {
+                let alloc_meta =
+                    this.memory.get_raw_mut(ptr.alloc_id)?.extra.data_race.as_mut().unwrap();
+                alloc_meta.reset_clocks(ptr.offset, size);
+            }
         }
+        Ok(())
     }
 }
 
-/// Vector clock metadata for a logical memory allocation
+/// Vector clock metadata for a logical memory allocation.
 #[derive(Debug, Clone)]
 pub struct VClockAlloc {
-
-    /// Range of Vector clocks, mapping to the vector-clock
-    ///  index of the last write to the bytes in this allocation
+    /// Assigns each byte a `MemoryCellClocks`.
     alloc_ranges: RefCell<RangeMap<MemoryCellClocks>>,
 
-    // Pointer to global state
+    /// Pointer to global state.
     global: MemoryExtra,
 }
 
 impl VClockAlloc {
-
-    /// Create a new data-race allocation detector
-    pub fn new_allocation(global: &MemoryExtra, len: Size) -> VClockAlloc {
+    /// Create a new data-race detector for newly allocated memory.
+    pub fn new_allocation(
+        global: &MemoryExtra,
+        len: Size,
+        kind: MemoryKind<MiriMemoryKind>,
+    ) -> VClockAlloc {
+        let (alloc_timestamp, alloc_index) = match kind {
+            // User allocated and stack memory should track allocation.
+            MemoryKind::Machine(
+                MiriMemoryKind::Rust | MiriMemoryKind::C | MiriMemoryKind::WinHeap,
+            )
+            | MemoryKind::Stack => {
+                let (alloc_index, clocks) = global.current_thread_state();
+                let alloc_timestamp = clocks.clock[alloc_index];
+                (alloc_timestamp, alloc_index)
+            }
+            // Other global memory should trace races but be allocated at the 0 timestamp.
+            MemoryKind::Machine(
+                MiriMemoryKind::Global
+                | MiriMemoryKind::Machine
+                | MiriMemoryKind::Env
+                | MiriMemoryKind::ExternStatic
+                | MiriMemoryKind::Tls,
+            )
+            | MemoryKind::CallerLocation
+            | MemoryKind::Vtable => (0, VectorIdx::MAX_INDEX),
+        };
         VClockAlloc {
             global: Rc::clone(global),
-            alloc_ranges: RefCell::new(
-                RangeMap::new(len, MemoryCellClocks::default())
-            )
+            alloc_ranges: RefCell::new(RangeMap::new(
+                len,
+                MemoryCellClocks::new(alloc_timestamp, alloc_index),
+            )),
+        }
+    }
+
+    fn reset_clocks(&mut self, offset: Size, len: Size) {
+        let mut alloc_ranges = self.alloc_ranges.borrow_mut();
+        for (_, range) in alloc_ranges.iter_mut(offset, len) {
+            // Reset the portion of the range
+            *range = MemoryCellClocks::new(0, VectorIdx::MAX_INDEX);
         }
     }
 
-    /// Report a data-race found in the program
-    ///  this finds the two racing threads and the type
-    ///  of data-race that occured, this will also
-    ///  return info about the memory location the data-race
-    ///  occured in
+    // Find an index, if one exists, where the value
+    // in `l` is greater than the value in `r`.
+    fn find_gt_index(l: &VClock, r: &VClock) -> Option<VectorIdx> {
+        log::trace!("Find index where not {:?} <= {:?}", l, r);
+        let l_slice = l.as_slice();
+        let r_slice = r.as_slice();
+        l_slice
+            .iter()
+            .zip(r_slice.iter())
+            .enumerate()
+            .find_map(|(idx, (&l, &r))| if l > r { Some(idx) } else { None })
+            .or_else(|| {
+                if l_slice.len() > r_slice.len() {
+                    // By invariant, if l_slice is longer
+                    // then one element must be larger.
+                    // This just validates that this is true
+                    // and reports earlier elements first.
+                    let l_remainder_slice = &l_slice[r_slice.len()..];
+                    let idx = l_remainder_slice
+                        .iter()
+                        .enumerate()
+                        .find_map(|(idx, &r)| if r == 0 { None } else { Some(idx) })
+                        .expect("Invalid VClock Invariant");
+                    Some(idx + r_slice.len())
+                } else {
+                    None
+                }
+            })
+            .map(|idx| VectorIdx::new(idx))
+    }
+
+    /// Report a data-race found in the program.
+    /// This finds the two racing threads and the type
+    /// of data-race that occurred. This will also
+    /// return info about the memory location the data-race
+    /// occurred in.
     #[cold]
     #[inline(never)]
     fn report_data_race<'tcx>(
-        global: &MemoryExtra, range: &MemoryCellClocks, action: &str,
-        pointer: Pointer<Tag>, len: Size
+        global: &MemoryExtra,
+        range: &MemoryCellClocks,
+        action: &str,
+        is_atomic: bool,
+        pointer: Pointer<Tag>,
+        len: Size,
     ) -> InterpResult<'tcx> {
-        let current_thread = global.current_thread();
-        let current_state = global.current_thread_state();
-        let mut write_clock = VClock::default();
-        let (
-            other_action, other_thread, other_clock
-        ) = if range.write > current_state.clock[range.write_thread] {
-
-            // Create effective write-clock that the data-race occured with
-            let wclock = write_clock.get_mut_with_min_len(
-                current_state.clock.as_slice().len()
-                .max(range.write_thread.to_u32() as usize + 1)
-            );
-            wclock[range.write_thread.to_u32() as usize] = range.write;
-            ("WRITE", range.write_thread, write_clock.as_slice())
-        }else{
-
-            // Find index in the read-clock that the data-race occured with
-            let read_slice = range.read.as_slice();
-            let clock_slice = current_state.clock.as_slice();
-            let conflicting_index = read_slice.iter()
-                .zip(clock_slice.iter())
-                .enumerate().find_map(|(idx,(&read, &clock))| {
-                    if read > clock {
-                        Some(idx)
-                    }else{
-                        None
-                    }
-            }).unwrap_or_else(|| {
-                assert!(read_slice.len() > clock_slice.len(), "BUG: cannot find read race yet reported data-race");
-                let rest_read = &read_slice[clock_slice.len()..];
-                rest_read.iter().enumerate().find_map(|(idx, &val)| {
-                    if val > 0 {
-                        Some(idx + clock_slice.len())
-                    }else{
-                        None
-                    }
-                }).expect("Invariant broken for read-slice, no 0 element at the tail")
-            });
-            ("READ", ThreadId::new(conflicting_index), range.read.as_slice())
+        let (current_index, current_clocks) = global.current_thread_state();
+        let write_clock;
+        let (other_action, other_thread, other_clock) = if range.write
+            > current_clocks.clock[range.write_index]
+        {
+            // Convert the write action into the vector clock it
+            // represents for diagnostic purposes.
+            write_clock = VClock::new_with_index(range.write_index, range.write);
+            (range.write_type.get_descriptor(), range.write_index, &write_clock)
+        } else if let Some(idx) = Self::find_gt_index(&range.read, &current_clocks.clock) {
+            ("Read", idx, &range.read)
+        } else if !is_atomic {
+            if let Some(atomic) = range.atomic() {
+                if let Some(idx) = Self::find_gt_index(&atomic.write_vector, &current_clocks.clock)
+                {
+                    ("Atomic Store", idx, &atomic.write_vector)
+                } else if let Some(idx) =
+                    Self::find_gt_index(&atomic.read_vector, &current_clocks.clock)
+                {
+                    ("Atomic Load", idx, &atomic.read_vector)
+                } else {
+                    unreachable!(
+                        "Failed to report data-race for non-atomic operation: no race found"
+                    )
+                }
+            } else {
+                unreachable!(
+                    "Failed to report data-race for non-atomic operation: no atomic component"
+                )
+            }
+        } else {
+            unreachable!("Failed to report data-race for atomic operation")
         };
 
-        let current_thread_info = global.print_thread_metadata(current_thread);
+        // Load elaborated thread information about the racing thread actions.
+        let current_thread_info = global.print_thread_metadata(current_index);
         let other_thread_info = global.print_thread_metadata(other_thread);
-        
-        // Throw the data-race detection
+
+        // Throw the data-race detection error.
         throw_ub_format!(
             "Data race detected between {} on {} and {} on {}, memory({:?},offset={},size={})\
-            \n\t\t -current vector clock = {:?}\
-            \n\t\t -conflicting timestamp = {:?}",
-            action, current_thread_info, 
-            other_action, other_thread_info,
-            pointer.alloc_id, pointer.offset.bytes(), len.bytes(),
-            current_state.clock,
+            \n(current vector clock = {:?}, conflicting timestamp = {:?})",
+            action,
+            current_thread_info,
+            other_action,
+            other_thread_info,
+            pointer.alloc_id,
+            pointer.offset.bytes(),
+            len.bytes(),
+            current_clocks.clock,
             other_clock
         )
     }
 
-    /// Detect data-races for an unsychronized read operation, will not perform
-    ///  data-race threads if `multi-threaded` is false, either due to no threads
-    ///  being created or if it is temporarily disabled during a racy read or write
-    ///  operation
+    /// Detect data-races for an unsynchronized read operation. This will not perform
+    /// data-race detection if `multi-threaded` is false, either because no threads
+    /// have been created or because it is temporarily disabled during a racy read or
+    /// write operation for which data-race detection is handled separately, for
+    /// example atomic read operations.
     pub fn read<'tcx>(&self, pointer: Pointer<Tag>, len: Size) -> InterpResult<'tcx> {
         if self.global.multi_threaded.get() {
-            let current_thread = self.global.current_thread();
-            let current_state = self.global.current_thread_state();
-
-            // The alloc-ranges are not split, however changes are not going to be made
-            //  to the ranges being tested, so this is ok
+            let (index, clocks) = self.global.current_thread_state();
             let mut alloc_ranges = self.alloc_ranges.borrow_mut();
-            for (_,range) in alloc_ranges.iter_mut(pointer.offset, len) {
-                if range.read_race_detect(&*current_state, current_thread) {
-                    // Report data-race
+            for (_, range) in alloc_ranges.iter_mut(pointer.offset, len) {
+                if let Err(DataRace) = range.read_race_detect(&*clocks, index) {
+                    // Report data-race.
                     return Self::report_data_race(
-                        &self.global,range, "READ", pointer, len
+                        &self.global,
+                        range,
+                        "Read",
+                        false,
+                        pointer,
+                        len,
                     );
                 }
             }
             Ok(())
-        }else{
+        } else {
             Ok(())
         }
     }
-    /// Detect data-races for an unsychronized write operation, will not perform
-    ///  data-race threads if `multi-threaded` is false, either due to no threads
-    ///  being created or if it is temporarily disabled during a racy read or write
-    ///  operation
-    pub fn write<'tcx>(&mut self, pointer: Pointer<Tag>, len: Size) -> InterpResult<'tcx> {
+
+    // Shared code for detecting data-races on unique access to a section of memory
+    fn unique_access<'tcx>(
+        &mut self,
+        pointer: Pointer<Tag>,
+        len: Size,
+        write_type: WriteType,
+    ) -> InterpResult<'tcx> {
         if self.global.multi_threaded.get() {
-            let current_thread = self.global.current_thread();
-            let current_state = self.global.current_thread_state();
-            for (_,range) in self.alloc_ranges.get_mut().iter_mut(pointer.offset, len) {
-                if range.write_race_detect(&*current_state, current_thread) {
+            let (index, clocks) = self.global.current_thread_state();
+            for (_, range) in self.alloc_ranges.get_mut().iter_mut(pointer.offset, len) {
+                if let Err(DataRace) = range.write_race_detect(&*clocks, index, write_type) {
                     // Report data-race
                     return Self::report_data_race(
-                        &self.global, range, "WRITE", pointer, len
+                        &self.global,
+                        range,
+                        write_type.get_descriptor(),
+                        false,
+                        pointer,
+                        len,
                     );
                 }
             }
             Ok(())
-        }else{
-            Ok(())
-        }
-    }
-    /// Detect data-races for an unsychronized deallocate operation, will not perform
-    ///  data-race threads if `multi-threaded` is false, either due to no threads
-    ///  being created or if it is temporarily disabled during a racy read or write
-    ///  operation
-    pub fn deallocate<'tcx>(&mut self, pointer: Pointer<Tag>, len: Size) -> InterpResult<'tcx> {
-        if self.global.multi_threaded.get() {
-            let current_thread = self.global.current_thread();
-            let current_state = self.global.current_thread_state();
-            for (_,range) in self.alloc_ranges.get_mut().iter_mut(pointer.offset, len) {
-                if range.write_race_detect(&*current_state, current_thread) {
-                    // Report data-race
-                    return Self::report_data_race(
-                        &self.global, range, "DEALLOCATE", pointer, len
-                    );
-                }
-            }
-           Ok(())
-        }else{
+        } else {
             Ok(())
         }
     }
-}
 
-/// The current set of vector clocks describing the state
-///  of a thread, contains the happens-before clock and
-///  additional metadata to model atomic fence operations
-#[derive(Clone, Default, Debug)]
-struct ThreadClockSet {
-    /// The increasing clock representing timestamps
-    ///  that happen-before this thread.
-    clock: VClock,
-
-    /// The set of timestamps that will happen-before this
-    ///  thread once it performs an acquire fence
-    fence_acquire: VClock,
+    /// Detect data-races for an unsynchronized write operation. This will not perform
+    /// data-race detection if `multi-threaded` is false, either because no threads
+    /// have been created or because it is temporarily disabled during a racy read or
+    /// write operation.
+    pub fn write<'tcx>(&mut self, pointer: Pointer<Tag>, len: Size) -> InterpResult<'tcx> {
+        self.unique_access(pointer, len, WriteType::Write)
+    }
 
-    /// The last timesamp of happens-before relations that
-    ///  have been released by this thread by a fence
-    fence_release: VClock,
+    /// Detect data-races for an unsynchronized deallocate operation. This will not perform
+    /// data-race detection if `multi-threaded` is false, either because no threads
+    /// have been created or because it is temporarily disabled during a racy read or
+    /// write operation.
+    pub fn deallocate<'tcx>(&mut self, pointer: Pointer<Tag>, len: Size) -> InterpResult<'tcx> {
+        self.unique_access(pointer, len, WriteType::Deallocate)
+    }
 }
 
-impl ThreadClockSet {
-
-    /// Apply the effects of a release fence to this
-    ///  set of thread vector clocks
+impl<'mir, 'tcx: 'mir> EvalContextPrivExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {}
+trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> {
+    // Temporarily allow data-races to occur. This should only be used
+    // if one of the appropriate `validate_atomic` functions will be called
+    // to treat a memory access as atomic, or if the memory being accessed
+    // should be treated as internal state that cannot be accessed by the
+    // interpreted program.
     #[inline]
-    fn apply_release_fence(&mut self) {
-        self.fence_release.set_values(&self.clock);
+    fn allow_data_races_ref<R>(&self, op: impl FnOnce(&MiriEvalContext<'mir, 'tcx>) -> R) -> R {
+        let this = self.eval_context_ref();
+        let old = if let Some(data_race) = &this.memory.extra.data_race {
+            data_race.multi_threaded.replace(false)
+        } else {
+            false
+        };
+        let result = op(this);
+        if let Some(data_race) = &this.memory.extra.data_race {
+            data_race.multi_threaded.set(old);
+        }
+        result
     }
 
-    /// Apply the effects of a acquire fence to this
-    ///  set of thread vector clocks
+    /// Same as `allow_data_races_ref`, but with a mutable evaluation context: this temporarily
+    /// disables any data-race detection and so should only be used for atomic operations or
+    /// internal state that the program cannot access.
     #[inline]
-    fn apply_acquire_fence(&mut self) {
-        self.clock.join(&self.fence_acquire);
+    fn allow_data_races_mut<R>(
+        &mut self,
+        op: impl FnOnce(&mut MiriEvalContext<'mir, 'tcx>) -> R,
+    ) -> R {
+        let this = self.eval_context_mut();
+        let old = if let Some(data_race) = &this.memory.extra.data_race {
+            data_race.multi_threaded.replace(false)
+        } else {
+            false
+        };
+        let result = op(this);
+        if let Some(data_race) = &this.memory.extra.data_race {
+            data_race.multi_threaded.set(old);
+        }
+        result
     }
 
-    /// Increment the happens-before clock at a
-    ///  known index
-    #[inline]
-    fn increment_clock(&mut self, thread: ThreadId) {
-        self.clock.increment_thread(thread);
-    }
+    /// Generic atomic operation implementation,
+    /// this accesses memory via get_raw instead of
+    /// get_raw_mut, due to issues calling get_raw_mut
+    /// for atomic loads from read-only memory.
+    /// FIXME: is this valid, or should get_raw_mut be used for
+    /// atomic-stores/atomic-rmw?
+    fn validate_atomic_op<A: Debug + Copy>(
+        &self,
+        place: &MPlaceTy<'tcx, Tag>,
+        atomic: A,
+        description: &str,
+        mut op: impl FnMut(
+            &mut MemoryCellClocks,
+            &mut ThreadClockSet,
+            VectorIdx,
+            A,
+        ) -> Result<(), DataRace>,
+    ) -> InterpResult<'tcx> {
+        let this = self.eval_context_ref();
+        if let Some(data_race) = &this.memory.extra.data_race {
+            if data_race.multi_threaded.get() {
+                // Load and log the atomic operation.
+                let place_ptr = place.ptr.assert_ptr();
+                let size = place.layout.size;
+                let alloc_meta =
+                    &this.memory.get_raw(place_ptr.alloc_id)?.extra.data_race.as_ref().unwrap();
+                log::trace!(
+                    "Atomic op({}) with ordering {:?} on memory({:?}, offset={}, size={})",
+                    description,
+                    &atomic,
+                    place_ptr.alloc_id,
+                    place_ptr.offset.bytes(),
+                    size.bytes()
+                );
+
+                // Perform the atomic operation.
+                let data_race = &alloc_meta.global;
+                data_race.maybe_perform_sync_operation(|index, mut clocks| {
+                    for (_, range) in
+                        alloc_meta.alloc_ranges.borrow_mut().iter_mut(place_ptr.offset, size)
+                    {
+                        if let Err(DataRace) = op(range, &mut *clocks, index, atomic) {
+                            mem::drop(clocks);
+                            return VClockAlloc::report_data_race(
+                                &alloc_meta.global,
+                                range,
+                                description,
+                                true,
+                                place_ptr,
+                                size,
+                            )
+                            .map(|_| true);
+                        }
+                    }
 
-    /// Join the happens-before clock with that of
-    ///  another thread, used to model thread join
-    ///  operations
-    fn join_with(&mut self, other: &ThreadClockSet) {
-        self.clock.join(&other.clock);
+                    // This conservatively assumes all operations have release semantics
+                    Ok(true)
+                })?;
+
+                // Log changes to atomic memory.
+                if log::log_enabled!(log::Level::Trace) {
+                    for (_, range) in alloc_meta.alloc_ranges.borrow().iter(place_ptr.offset, size)
+                    {
+                        log::trace!(
+                            "Updated atomic memory({:?}, offset={}, size={}) to {:#?}",
+                            place.ptr.assert_ptr().alloc_id,
+                            place_ptr.offset.bytes(),
+                            size.bytes(),
+                            range.atomic_ops
+                        );
+                    }
+                }
+            }
+        }
+        Ok(())
     }
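+
+    // Sketch of a hypothetical caller (the closure body and any per-cell update
+    // steps are assumed for illustration, not taken from this patch): an atomic
+    // load validation would forward to `validate_atomic_op` with a closure that
+    // performs the per-cell acquire update and reports any race it finds:
+    //     this.validate_atomic_op(place, atomic, "Atomic Load", |cell, clocks, index, atomic| {
+    //         // update `cell`/`clocks` for an acquire with ordering `atomic` at `index`,
+    //         // returning Err(DataRace) if it races with an earlier non-atomic write
+    //     })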
 }
 
+/// Extra metadata associated with a thread.
+#[derive(Debug, Clone, Default)]
+struct ThreadExtraState {
+    /// The vector index currently in use by the
+    /// thread. This is set to None
+    /// after the vector index has been re-used
+    /// and hence the value will never need to be
+    /// read during data-race reporting.
+    vector_index: Option<VectorIdx>,
+
+    /// The name of the thread, updated for better
+    /// diagnostics when reporting detected data
+    /// races.
+    thread_name: Option<Box<str>>,
+
+    /// Thread termination vector clock, this
+    /// is set on thread termination and is used
+    /// for joining on threads since the vector_index
+    /// may be re-used when the join operation occurs.
+    termination_vector_clock: Option<VClock>,
+}
+
 /// Global data-race detection state, contains the currently
-///  executing thread as well as the vector-clocks associated
-///  with each of the threads.
+/// executing thread as well as the vector-clocks associated
+/// with each of the threads.
 #[derive(Debug, Clone)]
 pub struct GlobalState {
-
     /// Set to true once the first additional
-    ///  thread has launched, due to the dependency
-    ///  between before and after a thread launch
+    /// thread has launched, due to the dependency
+    /// between before and after a thread launch.
     /// Any data-races must be recorded after this
-    ///  so concurrent execution can ignore recording
-    ///  any data-races
+    /// point, so execution before it can skip
+    /// recording any data-races.
     multi_threaded: Cell<bool>,
 
-    /// The current vector clock for all threads
-    ///  this includes threads that have terminated
-    ///  execution
-    thread_clocks: RefCell<IndexVec<ThreadId, ThreadClockSet>>,
-
-    /// Thread name cache for better diagnostics on the reporting
-    ///  of a data-race
-    thread_names: RefCell<IndexVec<ThreadId, Option<Box<str>>>>,
-
-    /// The current thread being executed,
-    ///  this is mirrored from the scheduler since
-    ///  it is required for loading the current vector
-    ///  clock for data-race detection
-    current_thread_id: Cell<ThreadId>,
+    /// Mapping of a vector index to a known set of thread
+    /// clocks. This is not a direct mapping from a thread id
+    /// since one index may represent multiple threads over time.
+    vector_clocks: RefCell<IndexVec<VectorIdx, ThreadClockSet>>,
+
+    /// Mapping of a given vector index to the current thread
+    /// that the execution is representing; this may change
+    /// if a vector index is re-assigned to a new thread.
+    vector_info: RefCell<IndexVec<VectorIdx, ThreadId>>,
+
+    /// The mapping of a given thread to associated thread metadata.
+    thread_info: RefCell<IndexVec<ThreadId, ThreadExtraState>>,
+
+    /// The current vector index being executed.
+    current_index: Cell<VectorIdx>,
+
+    /// Potential vector indices that could be re-used on thread creation;
+    /// values are inserted here after the thread has terminated and
+    /// been joined with, and hence may potentially become free
+    /// for use as the index for a new thread.
+    /// Elements in this set may still require the vector index to
+    /// report data-races, and can only be re-used after all
+    /// active vector-clocks catch up with the thread's timestamp.
+    reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
+
+    /// Counts the number of threads that are currently active;
+    /// if the number of active threads reduces to 1 and then
+    /// a join operation occurs with the remaining main thread,
+    /// then multi-threaded execution may be disabled.
+    active_thread_count: Cell<usize>,
+
+    /// This contains threads that have terminated, but not yet joined
+    /// and so cannot become re-use candidates until a join operation
+    /// occurs.
+    /// The associated vector index will be moved into re-use candidates
+    /// after the join operation occurs.
+    terminated_threads: RefCell<FxHashMap<ThreadId, VectorIdx>>,
 }
-impl GlobalState {
 
+impl GlobalState {
     /// Create a new global state, setup with just thread-id=0
-    ///  advanced to timestamp = 1
+    /// advanced to timestamp = 1.
     pub fn new() -> Self {
-        let mut vec = IndexVec::new();
-        let thread_id = vec.push(ThreadClockSet::default());
-        vec[thread_id].increment_clock(thread_id);
-        GlobalState {
+        let global_state = GlobalState {
             multi_threaded: Cell::new(false),
-            thread_clocks: RefCell::new(vec),
-            thread_names: RefCell::new(IndexVec::new()),
-            current_thread_id: Cell::new(thread_id),
+            vector_clocks: RefCell::new(IndexVec::new()),
+            vector_info: RefCell::new(IndexVec::new()),
+            thread_info: RefCell::new(IndexVec::new()),
+            current_index: Cell::new(VectorIdx::new(0)),
+            active_thread_count: Cell::new(1),
+            reuse_candidates: RefCell::new(FxHashSet::default()),
+            terminated_threads: RefCell::new(FxHashMap::default()),
+        };
+
+        // Set up the main thread since it is not explicitly created:
+        // it uses vector index and thread-id 0, and the Rust runtime gives
+        // the main thread a name of "main".
+        let index = global_state.vector_clocks.borrow_mut().push(ThreadClockSet::default());
+        global_state.vector_info.borrow_mut().push(ThreadId::new(0));
+        global_state.thread_info.borrow_mut().push(ThreadExtraState {
+            vector_index: Some(index),
+            thread_name: Some("main".to_string().into_boxed_str()),
+            termination_vector_clock: None,
+        });
+
+        global_state
+    }
+
+    // Try to find vector index values that can potentially be re-used
+    // by a new thread instead of a new vector index being created.
+    fn find_vector_index_reuse_candidate(&self) -> Option<VectorIdx> {
+        let mut reuse = self.reuse_candidates.borrow_mut();
+        let vector_clocks = self.vector_clocks.borrow();
+        let vector_info = self.vector_info.borrow();
+        let terminated_threads = self.terminated_threads.borrow();
+        for &candidate in reuse.iter() {
+            let target_timestamp = vector_clocks[candidate].clock[candidate];
+            if vector_clocks.iter_enumerated().all(|(clock_idx, clock)| {
+                // The candidate thread's timestamp happens-before this clock, and
+                // hence it cannot report a data-race with the candidate index.
+                let no_data_race = clock.clock[candidate] >= target_timestamp;
+
+                // The vector represents a thread that has terminated and hence cannot
+                // report a data-race with the candidate index.
+                let thread_id = vector_info[clock_idx];
+                let vector_terminated =
+                    reuse.contains(&clock_idx) || terminated_threads.contains_key(&thread_id);
+
+                // The vector index cannot report a race with the candidate index
+                // and hence allows the candidate index to be re-used.
+                no_data_race || vector_terminated
+            }) {
+                // Every vector clock has caught up with the target
+                // timestamp, and the thread is known to have
+                // terminated, therefore this vector clock index cannot
+                // report any more data-races.
+                assert!(reuse.remove(&candidate));
+                return Some(candidate);
+            }
         }
+        None
     }
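+
+    // Worked example (hypothetical timestamps): suppose vector index 1 terminated
+    // with clock[1] = 5 and sits in `reuse_candidates`. It may only be handed to a
+    // newly created thread once every other live vector clock satisfies
+    // clock[1] >= 5, i.e. every live thread has synchronized with the terminated
+    // thread's final actions, so nothing recorded under index 1 can race again.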
-    
 
     // Hook for thread creation, enables multi-threaded execution and marks
-    //  the current thread timestamp as happening-before the current thread
+    // the current thread timestamp as happening-before the created thread.
     #[inline]
     pub fn thread_created(&self, thread: ThreadId) {
+        let current_index = self.current_index();
 
-        // Enable multi-threaded execution mode now that there are at least
-        //  two threads
+        // Increment the number of active threads.
+        let active_threads = self.active_thread_count.get();
+        self.active_thread_count.set(active_threads + 1);
+
+        // Enable multi-threaded execution: there are now at least two threads,
+        // so data-races are now possible.
         self.multi_threaded.set(true);
-        let current_thread = self.current_thread_id.get();
-        let mut vectors = self.thread_clocks.borrow_mut();
-        vectors.ensure_contains_elem(thread, Default::default);
-        let (current, created) = vectors.pick2_mut(current_thread, thread);
 
-        // Pre increment clocks before atomic operation
-        current.increment_clock(current_thread);
+        // Load and set up the associated thread metadata.
+        let mut thread_info = self.thread_info.borrow_mut();
+        thread_info.ensure_contains_elem(thread, Default::default);
+
+        // Assign a vector index for the thread, attempting to re-use an old
+        // vector index that can no longer report any data-races if possible.
+        let created_index = if let Some(reuse_index) = self.find_vector_index_reuse_candidate() {
+            // Now re-configure the re-use candidate, increment the clock
+            // for the new sync use of the vector.
+            let mut vector_clocks = self.vector_clocks.borrow_mut();
+            vector_clocks[reuse_index].increment_clock(reuse_index);
+
+            // Locate the old thread the vector was associated with and update
+            // it to represent the new thread instead.
+            let mut vector_info = self.vector_info.borrow_mut();
+            let old_thread = vector_info[reuse_index];
+            vector_info[reuse_index] = thread;
+
+            // Mark the old thread the vector index was associated with as no
+            // longer having an assigned vector index.
+            thread_info[old_thread].vector_index = None;
+
+            reuse_index
+        } else {
+            // No vector re-use candidates available, instead create
+            // a new vector index.
+            let mut vector_info = self.vector_info.borrow_mut();
+            vector_info.push(thread)
+        };
+
+        log::trace!("Creating thread = {:?} with vector index = {:?}", thread, created_index);
+
+        // Mark the chosen vector index as in use by the thread.
+        thread_info[thread].vector_index = Some(created_index);
+
+        // Create a thread clock set if applicable.
+        let mut vector_clocks = self.vector_clocks.borrow_mut();
+        if created_index == vector_clocks.next_index() {
+            vector_clocks.push(ThreadClockSet::default());
+        }
+
+        // Now load the two clocks and configure the initial state.
+        let (current, created) = vector_clocks.pick2_mut(current_index, created_index);
 
-        // The current thread happens-before the created thread
-        //  so update the created vector clock
+        // Join the created thread's clock with the current thread's, since the
+        // current thread's previous actions happen-before the created thread.
         created.join_with(current);
 
-        // Post increment clocks after atomic operation
-        current.increment_clock(current_thread);
-        created.increment_clock(thread);
+        // Advance both threads after the synchronized operation.
+        // Both operations are considered to have release semantics.
+        current.increment_clock(current_index);
+        created.increment_clock(created_index);
     }
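+
+    // Worked example (hypothetical timestamps): a parent at vector index 0 with
+    // clock [4] spawning a child that is assigned fresh index 1:
+    //     child.clock  = join([0, 0], [4, 0]) = [4, 0]   // parent happens-before child
+    //     parent.clock becomes [5, 0] after increment_clock(0)
+    //     child.clock  becomes [4, 1] after increment_clock(1)
+    // so every access the parent performed up to timestamp 4 is ordered before
+    // anything the child does.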
 
     /// Hook on a thread join to update the implicit happens-before relation
-    ///  between the joined thead and the current thread
+    /// between the joined thread and the current thread.
     #[inline]
     pub fn thread_joined(&self, current_thread: ThreadId, join_thread: ThreadId) {
-        let mut vectors = self.thread_clocks.borrow_mut();
-        let (current, join) = vectors.pick2_mut(current_thread, join_thread);
+        let mut clocks_vec = self.vector_clocks.borrow_mut();
+        let thread_info = self.thread_info.borrow();
+
+        // Load the vector clock of the current thread.
+        let current_index = thread_info[current_thread]
+            .vector_index
+            .expect("Performed thread join on thread with no assigned vector");
+        let current = &mut clocks_vec[current_index];
 
-        // Pre increment clocks before atomic operation
-        current.increment_clock(current_thread);
-        join.increment_clock(join_thread);
+        // Load the associated vector clock for the terminated thread.
+        let join_clock = thread_info[join_thread]
+            .termination_vector_clock
+            .as_ref()
+            .expect("Joined with thread but thread has not terminated");
 
         // The join thread happens-before the current thread
-        //   so update the current vector clock
-        current.join_with(join);
+        // so update the current vector clock.
+        // This is not a release operation, so the clock is not incremented.
+        current.clock.join(join_clock);
+
+        // Check the number of active threads; if the value is 1,
+        // then test for potentially disabling multi-threaded execution.
+        let active_threads = self.active_thread_count.get();
+        if active_threads == 1 {
+            // May potentially be able to disable multi-threaded execution.
+            let current_clock = &clocks_vec[current_index];
+            if clocks_vec
+                .iter_enumerated()
+                .all(|(idx, clocks)| clocks.clock[idx] <= current_clock.clock[idx])
+            {
+                // All thread terminations happen-before the current clock
+                // therefore no data-races can be reported until a new thread
+                // is created, so disable multi-threaded execution.
+                self.multi_threaded.set(false);
+            }
+        }
+
+        // If the thread is marked as terminated but not joined
+        // then move the thread to the re-use set.
+        let mut termination = self.terminated_threads.borrow_mut();
+        if let Some(index) = termination.remove(&join_thread) {
+            let mut reuse = self.reuse_candidates.borrow_mut();
+            reuse.insert(index);
+        }
+    }
+
+    /// On thread termination, the vector clock may be re-used
+    /// in the future once all remaining thread-clocks catch
+    /// up with the timestamp of the terminated thread.
+    /// This assigns the thread termination a unique timestamp
+    /// which will be used when joining on the thread.
+    /// This should be called strictly before any calls to
+    /// `thread_joined`.
+    #[inline]
+    pub fn thread_terminated(&self) {
+        let current_index = self.current_index();
+
+        // Increment the clock to a unique termination timestamp.
+        let mut vector_clocks = self.vector_clocks.borrow_mut();
+        let current_clocks = &mut vector_clocks[current_index];
+        current_clocks.increment_clock(current_index);
+
+        // Load the current thread id for the executing vector.
+        let vector_info = self.vector_info.borrow();
+        let current_thread = vector_info[current_index];
+
+        // Load the current thread metadata, and move to a terminated
+        // vector state, setting up the vector clock that all join
+        // operations will use.
+        let mut thread_info = self.thread_info.borrow_mut();
+        let current = &mut thread_info[current_thread];
+        current.termination_vector_clock = Some(current_clocks.clock.clone());
 
-        // Post increment clocks after atomic operation
-        current.increment_clock(current_thread);
-        join.increment_clock(join_thread);
+        // Add this thread as a candidate for re-use after a thread join
+        // occurs.
+        let mut termination = self.terminated_threads.borrow_mut();
+        termination.insert(current_thread, current_index);
+
+        // Reduce the number of active threads, now that a thread has
+        // terminated.
+        let mut active_threads = self.active_thread_count.get();
+        active_threads -= 1;
+        self.active_thread_count.set(active_threads);
     }
 
     /// Hook for updating the local tracker of the currently
-    ///  enabled thread, should always be updated whenever
-    ///  `active_thread` in thread.rs is updated
+    /// enabled thread, should always be updated whenever
+    /// `active_thread` in thread.rs is updated.
     #[inline]
     pub fn thread_set_active(&self, thread: ThreadId) {
-        self.current_thread_id.set(thread);
+        let thread_info = self.thread_info.borrow();
+        let vector_idx = thread_info[thread]
+            .vector_index
+            .expect("Setting thread active with no assigned vector");
+        self.current_index.set(vector_idx);
     }
 
     /// Hook for updating the local tracker of the thread's name;
-    ///  this should always mirror the local value in thread.rs
-    ///  the thread name is used for improved diagnostics
-    ///  during a data-race
+    /// this should always mirror the local value in thread.rs.
+    /// The thread name is used for improved diagnostics
+    /// during a data-race.
     #[inline]
-    pub fn thread_set_name(&self, name: String) {
+    pub fn thread_set_name(&self, thread: ThreadId, name: String) {
         let name = name.into_boxed_str();
-        let mut names = self.thread_names.borrow_mut();
-        let thread = self.current_thread_id.get();
-        names.ensure_contains_elem(thread, Default::default);
-        names[thread] = Some(name);
-    }
-
-
-    /// Advance the vector clock for a thread
-    ///  this is called before and after any atomic/synchronizing operations
-    ///  that may manipulate state
-    #[inline]
-    fn advance_vector_clock(&self) {
-        let thread = self.current_thread_id.get();
-        let mut vectors = self.thread_clocks.borrow_mut();
-        vectors[thread].increment_clock(thread);
-
-        // Log the increment in the atomic vector clock
-        log::trace!("Atomic vector clock increase for {:?} to {:?}",thread, vectors[thread].clock);
+        let mut thread_info = self.thread_info.borrow_mut();
+        thread_info[thread].thread_name = Some(name);
+    }
+
+    /// Attempt to perform a synchronized operation; this
+    /// will perform no operation if multi-threading is
+    /// not currently enabled.
+    /// Otherwise, if the operation reports that it performed a
+    /// synchronizing (release) action, the clock for the current
+    /// vector is incremented afterwards, for data-race detection
+    /// of any happens-before edges the operation may create.
+    fn maybe_perform_sync_operation<'tcx>(
+        &self,
+        op: impl FnOnce(VectorIdx, RefMut<'_, ThreadClockSet>) -> InterpResult<'tcx, bool>,
+    ) -> InterpResult<'tcx> {
+        if self.multi_threaded.get() {
+            let (index, clocks) = self.current_thread_state_mut();
+            if op(index, clocks)? {
+                let (_, mut clocks) = self.current_thread_state_mut();
+                clocks.increment_clock(index);
+            }
+        }
+        Ok(())
     }
-    
 
     /// Internal utility to identify a thread stored internally
-    ///  returns the id and the name for better diagnostics
-    fn print_thread_metadata(&self, thread: ThreadId) -> String {
-        if let Some(Some(name)) = self.thread_names.borrow().get(thread) {
+    /// returns the id and the name for better diagnostics.
+    fn print_thread_metadata(&self, vector: VectorIdx) -> String {
+        let thread = self.vector_info.borrow()[vector];
+        let thread_name = &self.thread_info.borrow()[thread].thread_name;
+        if let Some(name) = thread_name {
             let name: &str = name;
             format!("Thread(id = {:?}, name = {:?})", thread.to_u32(), &*name)
-        }else{
+        } else {
             format!("Thread(id = {:?})", thread.to_u32())
         }
     }
 
-
     /// Acquire a lock, express that the previous call of
-    ///  `validate_lock_release` must happen before this
-    pub fn validate_lock_acquire(&self, lock: &DataRaceLockHandle, thread: ThreadId) {
-        let mut ref_vector = self.thread_clocks.borrow_mut();
-        ref_vector[thread].increment_clock(thread);
-
-        let clocks = &mut ref_vector[thread];
-        clocks.clock.join(&lock.clock);
-
-        ref_vector[thread].increment_clock(thread);
+    /// `validate_lock_release` must happen before this.
+    /// As this is an acquire operation, the thread timestamp is not
+    /// incremented.
+    pub fn validate_lock_acquire(&self, lock: &VClock, thread: ThreadId) {
+        let (_, mut clocks) = self.load_thread_state_mut(thread);
+        clocks.clock.join(&lock);
     }
 
     /// Release a lock handle, express that this happens-before
-    ///  any subsequent calls to `validate_lock_acquire`
-    pub fn validate_lock_release(&self, lock: &mut DataRaceLockHandle, thread: ThreadId) {
-        let mut ref_vector = self.thread_clocks.borrow_mut();
-        ref_vector[thread].increment_clock(thread);
-
-        let clocks = &ref_vector[thread];
-        lock.clock.set_values(&clocks.clock);
-
-        ref_vector[thread].increment_clock(thread);
+    /// any subsequent calls to `validate_lock_acquire`.
+    /// For normal locks this should be equivalent to `validate_lock_release_shared`
+    /// since an acquire operation should have occurred before; however,
+    /// for futex & condvar operations this is not the case and this
+    /// operation must be used.
+    pub fn validate_lock_release(&self, lock: &mut VClock, thread: ThreadId) {
+        let (index, mut clocks) = self.load_thread_state_mut(thread);
+        lock.clone_from(&clocks.clock);
+        clocks.increment_clock(index);
     }
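+
+    // Sketch (hypothetical mutex integration): an unlock followed by a lock on
+    // another thread creates the required happens-before edge by routing the
+    // unlocking thread's clock through the lock's vector clock:
+    //     data_race.validate_lock_release(&mut mutex_clock, unlocking_thread);
+    //     data_race.validate_lock_acquire(&mutex_clock, locking_thread);
+    // where `mutex_clock`, `unlocking_thread` and `locking_thread` are assumed to
+    // be tracked by the synchronization primitive's own state.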
 
     /// Release a lock handle, express that this happens-before
-    ///  any subsequent calls to `validate_lock_acquire` as well
-    ///  as any previous calls to this function after any
-    ///  `validate_lock_release` calls
-    pub fn validate_lock_release_shared(&self, lock: &mut DataRaceLockHandle, thread: ThreadId) {
-        let mut ref_vector = self.thread_clocks.borrow_mut();
-        ref_vector[thread].increment_clock(thread);
-
-        let clocks = &ref_vector[thread];
-        lock.clock.join(&clocks.clock);
-
-        ref_vector[thread].increment_clock(thread);
-    }
-
-    /// Load the thread clock set associated with the current thread
-    #[inline]
-    fn current_thread_state(&self) -> Ref<'_, ThreadClockSet> {
-        let ref_vector = self.thread_clocks.borrow();
-        let thread = self.current_thread_id.get();
-        Ref::map(ref_vector, |vector| &vector[thread])
-    }
-
-    /// Load the thread clock set associated with the current thread
-    ///  mutably for modification
-    #[inline]
-    fn current_thread_state_mut(&self) -> RefMut<'_, ThreadClockSet> {
-        let ref_vector = self.thread_clocks.borrow_mut();
-        let thread = self.current_thread_id.get();
-        RefMut::map(ref_vector, |vector| &mut vector[thread])
-    }
-
-    /// Return the current thread, should be the same
-    ///  as the data-race active thread
-    #[inline]
-    fn current_thread(&self) -> ThreadId {
-        self.current_thread_id.get()
-    }
-}
-
-
-/// The size of the vector-clock to store inline
-///  clock vectors larger than this will be stored on the heap
-const SMALL_VECTOR: usize = 4;
-
-/// The type of the time-stamps recorded in the data-race detector
-///  set to a type of unsigned integer
-type Timestamp = u32;
-
-/// A vector clock for detecting data-races
-///  invariants:
-///   - the last element in a VClock must not be 0
-///     -- this means that derive(PartialEq & Eq) is correct
-///     --  as there is no implicit zero tail that might be equal
-///     --  also simplifies the implementation of PartialOrd
-#[derive(Clone, PartialEq, Eq, Default, Debug)]
-pub struct VClock(SmallVec<[Timestamp; SMALL_VECTOR]>);
-
-impl VClock {
-
-    /// Load the backing slice behind the clock vector.
-    #[inline]
-    fn as_slice(&self) -> &[Timestamp] {
-        self.0.as_slice()
-    }
-
-    /// Get a mutable slice to the internal vector with minimum `min_len`
-    ///  elements, to preserve invariants this vector must modify
-    ///  the `min_len`-1 nth element to a non-zero value
+    /// any subsequent calls to `validate_lock_acquire` as well
+    /// as any previous calls to this function after any
+    /// `validate_lock_release` calls.
+    /// For normal locks this should be equivalent to `validate_lock_release`.
+    /// This function only exists for joining over the set of concurrent readers
+    /// in a read-write lock and should not be used for anything else.
+    pub fn validate_lock_release_shared(&self, lock: &mut VClock, thread: ThreadId) {
+        let (index, mut clocks) = self.load_thread_state_mut(thread);
+        lock.join(&clocks.clock);
+        clocks.increment_clock(index);
+    }
+
+    /// Load the vector index used by the given thread as well as the set of vector clocks
+    /// used by the thread.
     #[inline]
-    fn get_mut_with_min_len(&mut self, min_len: usize) -> &mut [Timestamp] {
-        if self.0.len() < min_len {
-            self.0.resize(min_len, 0);
-        }
-        assert!(self.0.len() >= min_len);
-        self.0.as_mut_slice()
-    }
-
-    /// Increment the vector clock at a known index
-    #[inline]
-    fn increment_index(&mut self, idx: usize) {
-        let mut_slice = self.get_mut_with_min_len(idx + 1);
-        let idx_ref = &mut mut_slice[idx];
-        *idx_ref = idx_ref.checked_add(1).expect("Vector clock overflow")
-    }
-
-    // Increment the vector element representing the progress
-    //  of execution in the given thread
-    #[inline]
-    pub fn increment_thread(&mut self, thread: ThreadId) {
-        self.increment_index(thread.to_u32() as usize);
-    }
-
-    // Join the two vector-clocks together, this
-    //  sets each vector-element to the maximum value
-    //  of that element in either of the two source elements.
-    pub fn join(&mut self, other: &Self) {
-        let rhs_slice = other.as_slice();
-        let lhs_slice = self.get_mut_with_min_len(rhs_slice.len());
-
-        // Element-wise set to maximum.
-        for (l, &r) in lhs_slice.iter_mut().zip(rhs_slice.iter()) {
-            *l = r.max(*l);
-        }
-    }
-
-    /// Joins with a thread at a known index
-    fn set_at_index(&mut self, other: &Self, idx: usize){
-        let mut_slice = self.get_mut_with_min_len(idx + 1);
-        let slice = other.as_slice();
-        mut_slice[idx] = slice[idx];
+    fn load_thread_state_mut(&self, thread: ThreadId) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
+        let index = self.thread_info.borrow()[thread]
+            .vector_index
+            .expect("Loading thread state for thread with no assigned vector");
+        let ref_vector = self.vector_clocks.borrow_mut();
+        let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]);
+        (index, clocks)
     }
 
-    /// Join with a threads vector clock only at the desired index
-    ///  returns true if the value updated
+    /// Load the current vector clock in use and the current set of thread clocks
+    /// in use for the vector.
     #[inline]
-    pub fn set_at_thread(&mut self, other: &Self, thread: ThreadId){
-        self.set_at_index(other, thread.to_u32() as usize);
+    fn current_thread_state(&self) -> (VectorIdx, Ref<'_, ThreadClockSet>) {
+        let index = self.current_index();
+        let ref_vector = self.vector_clocks.borrow();
+        let clocks = Ref::map(ref_vector, |vec| &vec[index]);
+        (index, clocks)
     }
 
-    /// Clear the vector to all zeros, stored as an empty internal
-    ///  vector
+    /// Load the current vector clock in use and the current set of thread clocks
+    /// in use for the vector mutably for modification.
     #[inline]
-    pub fn set_zero_vector(&mut self) {
-        self.0.clear();
-    }
-
-    /// Set the values stored in this vector clock
-    ///  to the values stored in another.
-    pub fn set_values(&mut self, new_value: &VClock) {
-        let new_slice = new_value.as_slice();
-        self.0.resize(new_slice.len(), 0);
-        self.0.copy_from_slice(new_slice);
-    }
-}
-
-
-impl PartialOrd for VClock {
-    fn partial_cmp(&self, other: &VClock) -> Option<Ordering> {
-
-        // Load the values as slices
-        let lhs_slice = self.as_slice();
-        let rhs_slice = other.as_slice();
-
-        // Iterate through the combined vector slice
-        //  keeping track of the order that is currently possible to satisfy.
-        // If an ordering relation is detected to be impossible, then bail and
-        //  directly return None
-        let mut iter = lhs_slice.iter().zip(rhs_slice.iter());
-        let mut order = match iter.next() {
-            Some((lhs, rhs)) => lhs.cmp(rhs),
-            None => Ordering::Equal
-        };
-        for (l, r) in iter {
-            match order {
-                Ordering::Equal => order = l.cmp(r),
-                Ordering::Less => if l > r {
-                    return None
-                },
-                Ordering::Greater => if l < r {
-                    return None
-                }
-            }
-        }
-
-        //Now test if either left or right have trailing elements
-        // by the invariant the trailing elements have at least 1
-        // non zero value, so no additional calculation is required
-        // to determine the result of the PartialOrder
-        let l_len = lhs_slice.len();
-        let r_len = rhs_slice.len();
-        match l_len.cmp(&r_len) {
-            // Equal has no additional elements: return current order
-            Ordering::Equal => Some(order),
-            // Right has at least 1 element > than the implicit 0,
-            //  so the only valid values are Ordering::Less or None
-            Ordering::Less => match order {
-                Ordering::Less | Ordering::Equal => Some(Ordering::Less),
-                Ordering::Greater => None
-            }
-            // Left has at least 1 element > than the implicit 0,
-            //  so the only valid values are Ordering::Greater or None
-            Ordering::Greater => match order {
-                Ordering::Greater | Ordering::Equal => Some(Ordering::Greater),
-                Ordering::Less => None
-            }
-        }
-    }
-
-    fn lt(&self, other: &VClock) -> bool {
-        // Load the values as slices
-        let lhs_slice = self.as_slice();
-        let rhs_slice = other.as_slice();
-
-        // If l_len > r_len then at least one element
-        //  in l_len is > than r_len, therefore the result
-        //  is either Some(Greater) or None, so return false
-        //  early.
-        let l_len = lhs_slice.len();
-        let r_len = rhs_slice.len();
-        if l_len <= r_len {
-            // If any elements on the left are greater than the right
-            //  then the result is None or Some(Greater), both of which
-            //  return false, the earlier test asserts that no elements in the
-            //  extended tail violate this assumption. Otherwise l <= r, finally
-            //  the case where the values are potentially equal needs to be considered
-            //  and false returned as well
-            let mut equal = l_len == r_len;
-            for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) {
-                if l > r {
-                    return false
-                }else if l < r {
-                    equal = false;
-                }
-            }
-            !equal
-        }else{
-            false
-        }
-    }
-
-    fn le(&self, other: &VClock) -> bool {
-        // Load the values as slices
-        let lhs_slice = self.as_slice();
-        let rhs_slice = other.as_slice();
-
-        // If l_len > r_len then at least one element
-        //  in l_len is > than r_len, therefore the result
-        //  is either Some(Greater) or None, so return false
-        //  early.
-        let l_len = lhs_slice.len();
-        let r_len = rhs_slice.len();
-        if l_len <= r_len {
-            // If any elements on the left are greater than the right
-            //  then the result is None or Some(Greater), both of which
-            //  return false, the earlier test asserts that no elements in the
-            //  extended tail violate this assumption. Otherwise l <= r
-            !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l > r)
-        }else{
-            false
-        }
-    }
-
-    fn gt(&self, other: &VClock) -> bool {
-        // Load the values as slices
-        let lhs_slice = self.as_slice();
-        let rhs_slice = other.as_slice();
-
-        // If r_len > l_len then at least one element
-        //  in r_len is > than l_len, therefore the result
-        //  is either Some(Less) or None, so return false
-        //  early.
-        let l_len = lhs_slice.len();
-        let r_len = rhs_slice.len();
-        if l_len >= r_len {
-            // If any elements on the left are less than the right
-            //  then the result is None or Some(Less), both of which
-            //  return false, the earlier test asserts that no elements in the
-            //  extended tail violate this assumption. Otherwise l >=, finally
-            //  the case where the values are potentially equal needs to be considered
-            //  and false returned as well
-            let mut equal = l_len == r_len;
-            for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) {
-                if l < r {
-                    return false
-                }else if l > r {
-                    equal = false;
-                }
-            }
-            !equal
-        }else{
-            false
-        }
-    }
-
-    fn ge(&self, other: &VClock) -> bool {
-        // Load the values as slices
-        let lhs_slice = self.as_slice();
-        let rhs_slice = other.as_slice();
-
-        // If r_len > l_len then at least one element
-        //  in r_len is > than l_len, therefore the result
-        //  is either Some(Less) or None, so return false
-        //  early.
-        let l_len = lhs_slice.len();
-        let r_len = rhs_slice.len();
-        if l_len >= r_len {
-            // If any elements on the left are less than the right
-            //  then the result is None or Some(Less), both of which
-            //  return false, the earlier test asserts that no elements in the
-            //  extended tail violate this assumption. Otherwise l >= r
-            !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l < r)
-        }else{
-            false
-        }
+    fn current_thread_state_mut(&self) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
+        let index = self.current_index();
+        let ref_vector = self.vector_clocks.borrow_mut();
+        let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]);
+        (index, clocks)
     }
-}
-
-impl Index<ThreadId> for VClock {
-    type Output = Timestamp;
 
+    /// Return the current vector index, which should correspond
+    /// to the data-race active thread.
     #[inline]
-    fn index(&self, index: ThreadId) -> &Timestamp {
-       self.as_slice().get(index.to_u32() as usize).unwrap_or(&0)
+    fn current_index(&self) -> VectorIdx {
+        self.current_index.get()
     }
 }
-
-
-/// Test vector clock ordering operations
-///  data-race detection is tested in the external
-///  test suite
-#[cfg(test)]
-mod tests {
-    use super::{VClock, Timestamp};
-    use std::cmp::Ordering;
-
-    #[test]
-    fn test_equal() {
-        let mut c1 = VClock::default();
-        let mut c2 = VClock::default();
-        assert_eq!(c1, c2);
-        c1.increment_index(5);
-        assert_ne!(c1, c2);
-        c2.increment_index(53);
-        assert_ne!(c1, c2);
-        c1.increment_index(53);
-        assert_ne!(c1, c2);
-        c2.increment_index(5);
-        assert_eq!(c1, c2);
-    }
-
-    #[test]
-    fn test_partial_order() {
-        // Small test
-        assert_order(&[1], &[1], Some(Ordering::Equal));
-        assert_order(&[1], &[2], Some(Ordering::Less));
-        assert_order(&[2], &[1], Some(Ordering::Greater));
-        assert_order(&[1], &[1,2], Some(Ordering::Less));
-        assert_order(&[2], &[1,2], None);
-
-        // Misc tests
-        assert_order(&[400], &[0, 1], None);
-
-        // Large test
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,10], &[0,1,2,3,4,5,6,7,8,9,10,0,0,0], Some(Ordering::Equal));
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,10], &[0,1,2,3,4,5,6,7,8,9,10,0,1,0], Some(Ordering::Less));
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,11], &[0,1,2,3,4,5,6,7,8,9,10,0,0,0], Some(Ordering::Greater));
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,11], &[0,1,2,3,4,5,6,7,8,9,10,0,1,0], None);
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,9 ], &[0,1,2,3,4,5,6,7,8,9,10,0,0,0], Some(Ordering::Less));
-        assert_order(&[0,1,2,3,4,5,6,7,8,9,9 ], &[0,1,2,3,4,5,6,7,8,9,10,0,1,0], Some(Ordering::Less));
-    }
-
-    fn from_slice(mut slice: &[Timestamp]) -> VClock {
-        while let Some(0) = slice.last() {
-            slice = &slice[..slice.len() - 1]
-        }
-        VClock(smallvec::SmallVec::from_slice(slice))
-    }
-
-    fn assert_order(l: &[Timestamp], r: &[Timestamp], o: Option<Ordering>) {
-        let l = from_slice(l);
-        let r = from_slice(r);
-
-        //Test partial_cmp
-        let compare = l.partial_cmp(&r);
-        assert_eq!(compare, o, "Invalid comparison\n l: {:?}\n r: {:?}",l,r);
-        let alt_compare = r.partial_cmp(&l);
-        assert_eq!(alt_compare, o.map(Ordering::reverse), "Invalid alt comparison\n l: {:?}\n r: {:?}",l,r);
-
-        //Test operatorsm with faster implementations
-        assert_eq!(
-            matches!(compare,Some(Ordering::Less)), l < r,
-            "Invalid (<):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(compare,Some(Ordering::Less) | Some(Ordering::Equal)), l <= r,
-            "Invalid (<=):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(compare,Some(Ordering::Greater)), l > r,
-            "Invalid (>):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(compare,Some(Ordering::Greater) | Some(Ordering::Equal)), l >= r,
-            "Invalid (>=):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(alt_compare,Some(Ordering::Less)), r < l,
-            "Invalid alt (<):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(alt_compare,Some(Ordering::Less) | Some(Ordering::Equal)), r <= l,
-            "Invalid alt (<=):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(alt_compare,Some(Ordering::Greater)), r > l,
-            "Invalid alt (>):\n l: {:?}\n r: {:?}",l,r
-        );
-        assert_eq!(
-            matches!(alt_compare,Some(Ordering::Greater) | Some(Ordering::Equal)), r >= l,
-            "Invalid alt (>=):\n l: {:?}\n r: {:?}",l,r
-        );
-    }
-}
\ No newline at end of file