diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 78254f59a4f..ad44b9ef483 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -782,11 +782,15 @@ void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0, unsigned cycles = 0) { - /* We can't safely write to unwritten destination VGPR lanes on GFX11 without waiting for - * the load to finish. + /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without + * waiting for the load to finish. + * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane + * dependency checks. */ + uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat; + uint32_t alu_events = event_trans | event_valu | event_salu; bool force_linear = - ctx.gfx_level >= GFX11 && (event & (event_lds | event_gds | event_vmem | event_flat)); + ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events)); insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles, force_linear);