//! This optimization replaces stack accesses with SSA variables and removes dead stores when possible.
//!
//! # Undefined behaviour
//!
//! This optimization is based on the assumption that stack slots which don't have their address
//! leaked through `stack_addr` are only accessed using `stack_load` and `stack_store` in the
//! function which has the stack slots. This optimization also assumes that stack slot accesses
//! are never out of bounds. If these assumptions are not correct, then this optimization may
//! remove a `stack_store` instruction incorrectly, or incorrectly use a previously stored value
//! as the value being loaded by a `stack_load`.
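//!
//! # Example
//!
//! A rough, illustrative sketch (hand-written, not actual output of this pass; the exact CLIF
//! syntax may differ) of the store-to-load forwarding and dead store removal performed here:
//!
//! ```text
//! ; before
//! ss0 = explicit_slot 4
//! v1 = iconst.i32 42
//! stack_store v1, ss0
//! v2 = stack_load.i32 ss0
//! v3 = iadd v2, v2
//!
//! ; after: v2 becomes an alias of v1, the store is dead and turned into a nop,
//! ; and the now unused stack slot is shrunk to size 0
//! v1 = iconst.i32 42
//! v2 -> v1
//! v3 = iadd v2, v2
//! ```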
use std::collections::BTreeMap;
use std::fmt;
use std::hash::BuildHasherDefault;
use std::ops::Not;

use rustc_data_structures::fx::{FxHashMap, FxHashSet, FxHasher};

use cranelift_codegen::cursor::{Cursor, FuncCursor};
use cranelift_codegen::ir::{InstructionData, Opcode, ValueDef};
use cranelift_codegen::ir::immediates::Offset32;

use hashbrown::HashSet;

use crate::prelude::*;
/// Workaround for `StackSlot` not implementing `Ord`.
#[derive(Copy, Clone, PartialEq, Eq)]
struct OrdStackSlot(StackSlot);

impl fmt::Debug for OrdStackSlot {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self.0)
    }
}

impl PartialOrd for OrdStackSlot {
    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
        self.0.as_u32().partial_cmp(&rhs.0.as_u32())
    }
}

impl Ord for OrdStackSlot {
    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
        self.0.as_u32().cmp(&rhs.0.as_u32())
    }
}
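/// All `stack_addr`, `stack_load` and `stack_store` instructions that reference a single stack slot.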
#[derive(Debug, Default)]
struct StackSlotUsage {
    stack_addr: HashSet<Inst, BuildHasherDefault<FxHasher>>,
    stack_load: HashSet<Inst, BuildHasherDefault<FxHasher>>,
    stack_store: HashSet<Inst, BuildHasherDefault<FxHasher>>,
}

impl StackSlotUsage {
    fn potential_stores_for_load(&self, ctx: &Context, load: Inst) -> Vec<Inst> {
        self.stack_store.iter().cloned().filter(|&store| {
            match spatial_overlap(&ctx.func, store, load) {
                SpatialOverlap::No => false, // Can never be the source of the loaded value.
                SpatialOverlap::Partial | SpatialOverlap::Full => true,
            }
        }).filter(|&store| {
            match temporal_order(ctx, store, load) {
                TemporalOrder::NeverBefore => false, // Can never be the source of the loaded value.
                TemporalOrder::MaybeBefore | TemporalOrder::DefinitivelyBefore => true,
            }
        }).collect::<Vec<Inst>>()
    }
    fn potential_loads_of_store(&self, ctx: &Context, store: Inst) -> Vec<Inst> {
        self.stack_load.iter().cloned().filter(|&load| {
            match spatial_overlap(&ctx.func, store, load) {
                SpatialOverlap::No => false, // Can never read the stored value.
                SpatialOverlap::Partial | SpatialOverlap::Full => true,
            }
        }).filter(|&load| {
            match temporal_order(ctx, store, load) {
                TemporalOrder::NeverBefore => false, // Can never read the stored value.
                TemporalOrder::MaybeBefore | TemporalOrder::DefinitivelyBefore => true,
            }
        }).collect::<Vec<Inst>>()
    }
    fn remove_unused_stack_addr(func: &mut Function, inst: Inst) {
        func.dfg.detach_results(inst);
        func.dfg.replace(inst).nop();
    }

    fn remove_unused_load(func: &mut Function, load: Inst) {
        func.dfg.detach_results(load);
        func.dfg.replace(load).nop();
    }

    fn remove_dead_store(&mut self, func: &mut Function, store: Inst) {
        func.dfg.replace(store).nop();
        self.stack_store.remove(&store);
    }

    fn change_load_to_alias(&mut self, func: &mut Function, load: Inst, value: Value) {
        let loaded_value = func.dfg.inst_results(load)[0];
        let loaded_type = func.dfg.value_type(loaded_value);

        if func.dfg.value_type(value) == loaded_type {
            func.dfg.detach_results(load);
            func.dfg.replace(load).nop();
            func.dfg.change_to_alias(loaded_value, value);
        } else {
            func.dfg.replace(load).bitcast(loaded_type, value);
        }

        self.stack_load.remove(&load);
    }
}
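/// State for one run of the pass: the Cranelift `Context` being optimized plus, for every
/// stack slot, the set of instructions that access it.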
struct OptimizeContext<'a> {
    ctx: &'a mut Context,
    stack_slot_usage_map: BTreeMap<OrdStackSlot, StackSlotUsage>,
}

impl<'a> OptimizeContext<'a> {
    fn for_context(ctx: &'a mut Context) -> Self {
        ctx.flowgraph(); // Compute cfg and domtree.

        // Record all stack_addr, stack_load and stack_store instructions.
        let mut stack_slot_usage_map = BTreeMap::<OrdStackSlot, StackSlotUsage>::new();

        let mut cursor = FuncCursor::new(&mut ctx.func);
        while let Some(_block) = cursor.next_block() {
            while let Some(inst) = cursor.next_inst() {
                match cursor.func.dfg[inst] {
                    InstructionData::StackLoad { opcode: Opcode::StackAddr, stack_slot, offset: _ } => {
                        stack_slot_usage_map.entry(OrdStackSlot(stack_slot)).or_insert_with(StackSlotUsage::default).stack_addr.insert(inst);
                    }
                    InstructionData::StackLoad { opcode: Opcode::StackLoad, stack_slot, offset: _ } => {
                        stack_slot_usage_map.entry(OrdStackSlot(stack_slot)).or_insert_with(StackSlotUsage::default).stack_load.insert(inst);
                    }
                    InstructionData::StackStore { opcode: Opcode::StackStore, arg: _, stack_slot, offset: _ } => {
                        stack_slot_usage_map.entry(OrdStackSlot(stack_slot)).or_insert_with(StackSlotUsage::default).stack_store.insert(inst);
                    }
                    _ => {}
                }
            }
        }

        OptimizeContext { ctx, stack_slot_usage_map }
    }
}
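/// Entry point of the pass: first merge `stack_addr` + `load`/`store` pairs into
/// `stack_load`/`stack_store`, then record all stack slot accesses and, per stack slot,
/// forward stores to loads and remove stores that are never loaded.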
pub(super) fn optimize_function(
    ctx: &mut Context,
    #[cfg_attr(not(debug_assertions), allow(unused_variables))]
    clif_comments: &mut crate::pretty_clif::CommentWriter,
) {
    combine_stack_addr_with_load_store(&mut ctx.func);

    let mut opt_ctx = OptimizeContext::for_context(ctx);

    // FIXME Repeat the following steps until a fixpoint is reached.

    remove_unused_stack_addr_and_stack_load(&mut opt_ctx);

    #[cfg(debug_assertions)] {
        for (&OrdStackSlot(stack_slot), usage) in &opt_ctx.stack_slot_usage_map {
            clif_comments.add_comment(stack_slot, format!("used by: {:?}", usage));
        }
    }

    for (stack_slot, users) in opt_ctx.stack_slot_usage_map.iter_mut() {
        if users.stack_addr.is_empty().not() {
            // Stack addr leaked; there may be unknown loads and stores.
            // FIXME use stacked borrows to optimize
            continue;
        }

        for load in users.stack_load.clone().into_iter() {
            let potential_stores = users.potential_stores_for_load(&opt_ctx.ctx, load);

            #[cfg(debug_assertions)]
            for &store in &potential_stores {
                clif_comments.add_comment(load, format!(
                    "Potential store -> load forwarding {} -> {} ({:?}, {:?})",
                    opt_ctx.ctx.func.dfg.display_inst(store, None),
                    opt_ctx.ctx.func.dfg.display_inst(load, None),
                    spatial_overlap(&opt_ctx.ctx.func, store, load),
                    temporal_order(&opt_ctx.ctx, store, load),
                ));
            }

            match *potential_stores {
                [] => {
                    #[cfg(debug_assertions)]
                    clif_comments.add_comment(load, format!("[BUG?] Reading uninitialized memory"));
                }
                [store] if spatial_overlap(&opt_ctx.ctx.func, store, load) == SpatialOverlap::Full && temporal_order(&opt_ctx.ctx, store, load) == TemporalOrder::DefinitivelyBefore => {
                    // Only one store could have been the origin of the value.
                    let stored_value = opt_ctx.ctx.func.dfg.inst_args(store)[0];

                    #[cfg(debug_assertions)]
                    clif_comments.add_comment(load, format!("Store to load forward {} -> {}", store, load));

                    users.change_load_to_alias(&mut opt_ctx.ctx.func, load, stored_value);
                }
                _ => {} // FIXME implement this
            }
        }

        for store in users.stack_store.clone().into_iter() {
            let potential_loads = users.potential_loads_of_store(&opt_ctx.ctx, store);

            #[cfg(debug_assertions)]
            for &load in &potential_loads {
                clif_comments.add_comment(store, format!(
                    "Potential load from store {} <- {} ({:?}, {:?})",
                    opt_ctx.ctx.func.dfg.display_inst(load, None),
                    opt_ctx.ctx.func.dfg.display_inst(store, None),
                    spatial_overlap(&opt_ctx.ctx.func, store, load),
                    temporal_order(&opt_ctx.ctx, store, load),
                ));
            }

            if potential_loads.is_empty() {
                // Never loaded; can safely remove all stores and the stack slot.
                // FIXME also remove stores when there is always a next store before a load.

                #[cfg(debug_assertions)]
                clif_comments.add_comment(store, format!("Remove dead stack store {} of {}", opt_ctx.ctx.func.dfg.display_inst(store, None), stack_slot.0));

                users.remove_dead_store(&mut opt_ctx.ctx.func, store);
            }
        }

        if users.stack_store.is_empty() && users.stack_load.is_empty() {
            opt_ctx.ctx.func.stack_slots[stack_slot.0].size = 0;
        }
    }
}
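/// Rewrite a `load`/`store` whose address comes from a `stack_addr` into a
/// `stack_load`/`stack_store`, adding the two offsets together. The `stack_addr` itself is
/// removed later if it ends up unused. Illustrative sketch (exact CLIF syntax may differ):
///
/// ```text
/// v1 = stack_addr.i64 ss0+8
/// v2 = load.i32 v1+4
/// ; the load becomes
/// v2 = stack_load.i32 ss0+12
/// ```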
fn combine_stack_addr_with_load_store(func: &mut Function) {
    // Turn load and store into stack_load and stack_store when possible.
    let mut cursor = FuncCursor::new(func);
    while let Some(_block) = cursor.next_block() {
        while let Some(inst) = cursor.next_inst() {
            match cursor.func.dfg[inst] {
                InstructionData::Load { opcode: Opcode::Load, arg: addr, flags: _, offset } => {
                    if cursor.func.dfg.ctrl_typevar(inst) == types::I128 || cursor.func.dfg.ctrl_typevar(inst).is_vector() {
                        continue; // WORKAROUND: stack_load.i128 not yet implemented
                    }
                    if let Some((stack_slot, stack_addr_offset)) = try_get_stack_slot_and_offset_for_addr(cursor.func, addr) {
                        if let Some(combined_offset) = offset.try_add_i64(stack_addr_offset.into()) {
                            let ty = cursor.func.dfg.ctrl_typevar(inst);
                            cursor.func.dfg.replace(inst).stack_load(ty, stack_slot, combined_offset);
                        }
                    }
                }
                InstructionData::Store { opcode: Opcode::Store, args: [value, addr], flags: _, offset } => {
                    if cursor.func.dfg.ctrl_typevar(inst) == types::I128 || cursor.func.dfg.ctrl_typevar(inst).is_vector() {
                        continue; // WORKAROUND: stack_store.i128 not yet implemented
                    }
                    if let Some((stack_slot, stack_addr_offset)) = try_get_stack_slot_and_offset_for_addr(cursor.func, addr) {
                        if let Some(combined_offset) = offset.try_add_i64(stack_addr_offset.into()) {
                            cursor.func.dfg.replace(inst).stack_store(value, stack_slot, combined_offset);
                        }
                    }
                }
                _ => {}
            }
        }
    }
}
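/// Replace `stack_addr` and `stack_load` instructions whose results are never used with nops,
/// so the per-slot bookkeeping only keeps accesses that still matter.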
fn remove_unused_stack_addr_and_stack_load(opt_ctx: &mut OptimizeContext<'_>) {
    // FIXME incrementally rebuild on each call?
    let mut stack_addr_load_insts_users = FxHashMap::<Inst, FxHashSet<Inst>>::default();

    let mut cursor = FuncCursor::new(&mut opt_ctx.ctx.func);
    while let Some(_block) = cursor.next_block() {
        while let Some(inst) = cursor.next_inst() {
            for &arg in cursor.func.dfg.inst_args(inst) {
                if let ValueDef::Result(arg_origin, 0) = cursor.func.dfg.value_def(arg) {
                    match cursor.func.dfg[arg_origin].opcode() {
                        Opcode::StackAddr | Opcode::StackLoad => {
                            stack_addr_load_insts_users.entry(arg_origin).or_insert_with(FxHashSet::default).insert(inst);
                        }
                        _ => {}
                    }
                }
            }
        }
    }

    #[cfg(debug_assertions)]
    for inst in stack_addr_load_insts_users.keys() {
        let mut is_recorded_stack_addr_or_stack_load = false;
        for stack_slot_users in opt_ctx.stack_slot_usage_map.values() {
            is_recorded_stack_addr_or_stack_load |= stack_slot_users.stack_addr.contains(inst) || stack_slot_users.stack_load.contains(inst);
        }
        assert!(is_recorded_stack_addr_or_stack_load);
    }

    // Replace all unused stack_addr and stack_load instructions with nop.
    let mut func = &mut opt_ctx.ctx.func;

    // drain_filter() on hashbrown::HashSet drains the items that do *not* match the
    // predicate. Once hashbrown gets updated to match the behaviour of std::drain_filter
    // (0.8.2), the predicate will have to be reversed.
    for stack_slot_users in opt_ctx.stack_slot_usage_map.values_mut() {
        stack_slot_users
            .stack_addr
            .drain_filter(|inst| !(stack_addr_load_insts_users.get(inst).map(|users| users.is_empty()).unwrap_or(true)))
            .for_each(|inst| StackSlotUsage::remove_unused_stack_addr(&mut func, inst));

        stack_slot_users
            .stack_load
            .drain_filter(|inst| !(stack_addr_load_insts_users.get(inst).map(|users| users.is_empty()).unwrap_or(true)))
            .for_each(|inst| StackSlotUsage::remove_unused_load(&mut func, inst));
    }
}
fn try_get_stack_slot_and_offset_for_addr(func: &Function, addr: Value) -> Option<(StackSlot, Offset32)> {
    if let ValueDef::Result(addr_inst, 0) = func.dfg.value_def(addr) {
        if let InstructionData::StackLoad { opcode: Opcode::StackAddr, stack_slot, offset } = func.dfg[addr_inst] {
            return Some((stack_slot, offset));
        }
    }
    None
}
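/// How the byte ranges touched by two stack slot accesses relate: disjoint, partially
/// overlapping, or exactly the same slot, offset and size.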
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum SpatialOverlap {
    No,
    Partial,
    Full,
}

fn spatial_overlap(func: &Function, src: Inst, dest: Inst) -> SpatialOverlap {
    fn inst_info(func: &Function, inst: Inst) -> (StackSlot, Offset32, u32) {
        match func.dfg[inst] {
            InstructionData::StackLoad { opcode: Opcode::StackAddr, stack_slot, offset }
            | InstructionData::StackLoad { opcode: Opcode::StackLoad, stack_slot, offset }
            | InstructionData::StackStore { opcode: Opcode::StackStore, arg: _, stack_slot, offset } => {
                (stack_slot, offset, func.dfg.ctrl_typevar(inst).bytes())
            }
            _ => unreachable!("{:?}", func.dfg[inst]),
        }
    }

    debug_assert_ne!(src, dest);

    let (src_ss, src_offset, src_size) = inst_info(func, src);
    let (dest_ss, dest_offset, dest_size) = inst_info(func, dest);

    if src_ss != dest_ss {
        return SpatialOverlap::No;
    }

    if src_offset == dest_offset && src_size == dest_size {
        return SpatialOverlap::Full;
    }

    let src_end: i64 = src_offset.try_add_i64(i64::from(src_size)).unwrap().into();
    let dest_end: i64 = dest_offset.try_add_i64(i64::from(dest_size)).unwrap().into();
    if src_end <= dest_offset.into() || dest_end <= src_offset.into() {
        return SpatialOverlap::No;
    }

    SpatialOverlap::Partial
}
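/// A conservative ordering of `src` relative to `dest` in program execution, derived from the
/// dominator tree.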
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum TemporalOrder {
    /// `src` will never be executed before `dest`.
    NeverBefore,
    /// `src` may be executed before `dest`.
    MaybeBefore,
    /// `src` will always be executed before `dest`.
    /// There may still be other instructions in between.
    DefinitivelyBefore,
}

fn temporal_order(ctx: &Context, src: Inst, dest: Inst) -> TemporalOrder {
    debug_assert_ne!(src, dest);

    if ctx.domtree.dominates(src, dest, &ctx.func.layout) {
        TemporalOrder::DefinitivelyBefore
    } else if ctx.domtree.dominates(src, dest, &ctx.func.layout) {
        // FIXME: this condition duplicates the one above, so `NeverBefore` is never returned
        // and everything else is conservatively treated as `MaybeBefore`.
        TemporalOrder::NeverBefore
    } else {
        TemporalOrder::MaybeBefore
    }
}