From 30c7f5820ff00b5f1a96fd8a8fc895cb9f8d3dc3 Mon Sep 17 00:00:00 2001 From: Calder Young Date: Tue, 14 Apr 2026 20:56:42 -0700 Subject: [PATCH 1/2] brw: Allow instruction reordering around memory writes Our scheduler is overly conservative about reordering instructions around memory writes or fences. Fortunately, there are several simple assumptions we can make about our IR to schedule these things a lot more fluidly: * Unless it's an EOT, a SEND instruction's side effects will only be observed through other SEND instructions * The effects of workgroup barriers, memory fences, and BRW_OPCODE_SYNC are only used in the IR to synchronize SEND instructions * All other scheduler dependencies related to memory access are already expressed through the source and destination operands Reviewed-by: Francisco Jerez --- .../brw/brw_schedule_instructions.cpp | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw/brw_schedule_instructions.cpp b/src/intel/compiler/brw/brw_schedule_instructions.cpp index a2a837eaea5..f856eaa0995 100644 --- a/src/intel/compiler/brw/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw/brw_schedule_instructions.cpp @@ -571,6 +571,7 @@ public: int block_count, bool post_reg_alloc, bool need_latencies); void add_barrier_deps(schedule_node *n); + void add_memory_deps(schedule_node *n); void add_cross_lane_deps(schedule_node *n); void add_dep(schedule_node *before, schedule_node *after, int latency); void add_dep(schedule_node *before, schedule_node *after); @@ -1080,8 +1081,27 @@ static bool is_scheduling_barrier(const brw_inst *inst) { return inst->opcode == SHADER_OPCODE_HALT_TARGET || + inst->opcode == SHADER_OPCODE_RND_MODE || + inst->opcode == SHADER_OPCODE_FLOAT_CONTROL_MODE || (inst->is_control_flow() && inst->opcode != BRW_OPCODE_HALT) || - inst->has_side_effects(); + inst->eot; +} + +static bool +has_memory_side_effects(const brw_inst *inst) +{ + assert(inst->opcode != 
SHADER_OPCODE_LSC_SPILL); + return inst->opcode == BRW_OPCODE_SYNC || + inst->opcode == SHADER_OPCODE_BARRIER || + inst->opcode == FS_OPCODE_SCHEDULING_FENCE || + (inst->is_send() && inst->as_send()->has_side_effects); +} + +static bool +is_memory_volatile(const brw_inst *inst) +{ + return has_memory_side_effects(inst) || + (inst->is_send() && inst->as_send()->is_volatile); } static bool @@ -1186,6 +1206,30 @@ brw_instruction_scheduler::add_barrier_deps(schedule_node *n) } } +void +brw_instruction_scheduler::add_memory_deps(schedule_node *n) +{ + for (schedule_node *prev = n - 1; prev >= current.start; prev--) { + if (has_memory_side_effects(prev->inst)) { + add_dep(prev, n, 0); + break; + } + if (is_memory_volatile(prev->inst)) { + add_dep(prev, n, 0); + } + } + + for (schedule_node *next = n + 1; next < current.end; next++) { + if (has_memory_side_effects(next->inst)) { + add_dep(n, next, 0); + break; + } + if (is_memory_volatile(next->inst)) { + add_dep(n, next, 0); + } + } +} + /** * Because some instructions like HALT can disable lanes, scheduling prior to * a cross lane access should not be allowed, otherwise we could end up with @@ -1338,6 +1382,9 @@ brw_instruction_scheduler::calculate_deps() if (is_scheduling_barrier(inst)) add_barrier_deps(n); + if (has_memory_side_effects(inst)) + add_memory_deps(n); + if (inst->opcode == BRW_OPCODE_HALT || inst->opcode == SHADER_OPCODE_HALT_TARGET) add_cross_lane_deps(n); From bbfc98657379efe63ef3d98ba82d6d32ff84286c Mon Sep 17 00:00:00 2001 From: Calder Young Date: Wed, 15 Apr 2026 12:39:38 -0700 Subject: [PATCH 2/2] brw: Add support for ACCESS_CAN_REORDER memory ordering Passes the ACCESS_CAN_REORDER flag from NIR on to the backend so that we can lower the loads to a non-volatile SEND. This allows the scheduler to freely reorder them around stores or fences. 
Reviewed-by: Francisco Jerez --- src/intel/compiler/brw/brw_eu_defines.h | 4 ++++ src/intel/compiler/brw/brw_from_nir.cpp | 4 +++- src/intel/compiler/brw/brw_lower_logical_sends.cpp | 6 ++++-- src/intel/compiler/brw/brw_opt_cse.cpp | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index 29801caa6de..4b3554b71e8 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -669,6 +669,10 @@ enum memory_flags { * fusion (Gfx12.x only). */ MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4, + /** Whether this memory load can be arbitrarily reordered or CSE'd + * with other loads. + */ + MEMORY_FLAG_CAN_REORDER = 1 << 5, }; enum rt_logical_srcs { diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index c84ba0cbff9..0556d5ad7a0 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -5936,13 +5936,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, (nir_intrinsic_access(instr) & ACCESS_COHERENT); const bool fused_eu_disable = nir_intrinsic_has_access(instr) && (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL); + const bool can_reorder = nir_intrinsic_can_reorder(instr); const unsigned alignment = nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0; uint8_t flags = (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) | (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) | (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) | - (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0); + (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0) | + (can_reorder ? 
MEMORY_FLAG_CAN_REORDER : 0); bool no_mask_handle = false; uint8_t coord_components = 1; diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 7fbe16f85ea..f141f0b7c55 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -1192,6 +1192,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS; const bool has_side_effects = mem->has_side_effects(); const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE; + const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER; const uint32_t data_size_B = lsc_data_size_bytes(data_size); const enum brw_reg_type data_type = @@ -1344,7 +1345,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) send->ex_mlen = ex_mlen; send->header_size = 0; send->has_side_effects = has_side_effects; - send->is_volatile = !has_side_effects || volatile_access; + send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access; send->fused_eu_disable = fused_eu_disable; /* Finally, the payload */ @@ -1405,6 +1406,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS; const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE; const bool has_side_effects = mem->has_side_effects(); + const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER; const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null(); assert(mem->address_offset == 0); @@ -1610,7 +1612,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) send->ex_mlen = ex_mlen; send->header_size = header.file != BAD_FILE ? 
1 : 0; send->has_side_effects = has_side_effects; - send->is_volatile = !has_side_effects || volatile_access; + send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access; send->fused_eu_disable = fused_eu_disable; if (block) { diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp index 3aac87b4ce6..b7e5bf37118 100644 --- a/src/intel/compiler/brw/brw_opt_cse.cpp +++ b/src/intel/compiler/brw/brw_opt_cse.cpp @@ -94,7 +94,7 @@ is_expression(const brw_shader *v, const brw_inst *const inst) case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return true; case SHADER_OPCODE_MEMORY_LOAD_LOGICAL: - return inst->as_mem()->mode == MEMORY_MODE_CONSTANT; + return inst->as_mem()->flags & MEMORY_FLAG_CAN_REORDER; case SHADER_OPCODE_LOAD_PAYLOAD: return !is_coalescing_payload(*v, inst); case SHADER_OPCODE_SEND: