`reserve_and_rehash` takes up 1.4% of the runtime on the `packed-simd`
benchmark, which I believe is because the number of reads is very low in
many cases (see https://github.com/rust-lang/rust/pull/50565 for
instance).
This avoids allocating the set until we start allocating the `reads`
`SmallVec`, but it is possible that a lower limit might be better (not
tested, since the improvement will be hard to spot either way).
let icx = if let Some(icx) = icx { icx } else { return };
if let Some(task_deps) = icx.task_deps {
let mut task_deps = task_deps.lock();
+ let task_deps = &mut *task_deps;
if cfg!(debug_assertions) {
self.current.total_read_count.fetch_add(1, Relaxed);
}
- if task_deps.read_set.insert(source) {
+
+ // As long as we only have a low number of reads we can avoid doing a hash
+ // insert and potentially allocating/reallocating the hashmap
+ let new_read = if task_deps.reads.len() < TASK_DEPS_READS_CAP {
+ task_deps.reads.iter().all(|other| *other != source)
+ } else {
+ task_deps.read_set.insert(source)
+ };
+ if new_read {
task_deps.reads.push(source);
+ if task_deps.reads.len() == TASK_DEPS_READS_CAP {
+ // Fill `read_set` with what we have so far so we can use the hashset next
+ // time
+ task_deps.read_set.extend(task_deps.reads.iter().copied());
+ }
#[cfg(debug_assertions)]
{
+const TASK_DEPS_READS_CAP: usize = 8;
pub struct TaskDeps {
#[cfg(debug_assertions)]
node: Option<DepNode>,
- reads: SmallVec<[DepNodeIndex; 8]>,
+ reads: SmallVec<[DepNodeIndex; TASK_DEPS_READS_CAP]>,
read_set: FxHashSet<DepNodeIndex>,
}