Merge branch 'review/brw-scheduler-fence-improvements' into 'main'

brw: Allow instruction reordering around memory writes

See merge request mesa/mesa!41008
This commit is contained in:
Calder Young 2026-05-08 00:09:17 +00:00
commit b532eda0c0
5 changed files with 60 additions and 5 deletions

View file

@ -669,6 +669,10 @@ enum memory_flags {
* fusion (Gfx12.x only).
*/
MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4,
/** Whether this memory load can be arbitrarily reordered or CSE'd
* with other loads.
*/
MEMORY_FLAG_CAN_REORDER = 1 << 5,
};
enum rt_logical_srcs {

View file

@ -5930,13 +5930,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
(nir_intrinsic_access(instr) & ACCESS_COHERENT);
const bool fused_eu_disable = nir_intrinsic_has_access(instr) &&
(nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL);
const bool can_reorder = nir_intrinsic_can_reorder(instr);
const unsigned alignment =
nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;
uint8_t flags =
(include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
(volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
(coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) |
(fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0);
(fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0) |
(can_reorder ? MEMORY_FLAG_CAN_REORDER : 0);
bool no_mask_handle = false;
uint8_t coord_components = 1;

View file

@ -1192,6 +1192,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS;
const bool has_side_effects = mem->has_side_effects();
const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER;
const uint32_t data_size_B = lsc_data_size_bytes(data_size);
const enum brw_reg_type data_type =
@ -1344,7 +1345,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
send->ex_mlen = ex_mlen;
send->header_size = 0;
send->has_side_effects = has_side_effects;
send->is_volatile = !has_side_effects || volatile_access;
send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access;
send->fused_eu_disable = fused_eu_disable;
/* Finally, the payload */
@ -1405,6 +1406,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
const bool has_side_effects = mem->has_side_effects();
const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER;
const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null();
assert(mem->address_offset == 0);
@ -1610,7 +1612,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
send->ex_mlen = ex_mlen;
send->header_size = header.file != BAD_FILE ? 1 : 0;
send->has_side_effects = has_side_effects;
send->is_volatile = !has_side_effects || volatile_access;
send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access;
send->fused_eu_disable = fused_eu_disable;
if (block) {

View file

@ -94,7 +94,7 @@ is_expression(const brw_shader *v, const brw_inst *const inst)
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
return true;
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
return inst->as_mem()->mode == MEMORY_MODE_CONSTANT;
return inst->as_mem()->flags & MEMORY_FLAG_CAN_REORDER;
case SHADER_OPCODE_LOAD_PAYLOAD:
return !is_coalescing_payload(*v, inst);
case SHADER_OPCODE_SEND:

View file

@ -571,6 +571,7 @@ public:
int block_count, bool post_reg_alloc, bool need_latencies);
void add_barrier_deps(schedule_node *n);
void add_memory_deps(schedule_node *n);
void add_cross_lane_deps(schedule_node *n);
void add_dep(schedule_node *before, schedule_node *after, int latency);
void add_dep(schedule_node *before, schedule_node *after);
@ -1080,8 +1081,27 @@ static bool
is_scheduling_barrier(const brw_inst *inst)
{
   /* A scheduling barrier may not be reordered against any other
    * instruction: HALT targets, rounding- and float-control-mode changes,
    * control flow (except HALT itself, which is ordered separately via
    * add_cross_lane_deps()), and the EOT message.
    *
    * NOTE(review): the stripped diff left both the removed operand
    * (inst->has_side_effects()) and the added one (inst->eot) in place,
    * which kept memory side effects acting as full barriers and left a
    * dead `inst->eot;` statement.  Keep only the post-change condition:
    * side-effecting memory accesses are now ordered individually by
    * add_memory_deps() instead of acting as barriers.
    */
   return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
          inst->opcode == SHADER_OPCODE_RND_MODE ||
          inst->opcode == SHADER_OPCODE_FLOAT_CONTROL_MODE ||
          (inst->is_control_flow() && inst->opcode != BRW_OPCODE_HALT) ||
          inst->eot;
}
static bool
has_memory_side_effects(const brw_inst *inst)
{
   /* Spill/fill pseudo-ops are expected to be gone before scheduling. */
   assert(inst->opcode != SHADER_OPCODE_LSC_SPILL);

   /* Whether this instruction's memory effects must stay ordered with
    * respect to other memory accesses: explicit sync/barrier/fence
    * opcodes, or a send message flagged as having side effects.
    */
   switch (inst->opcode) {
   case BRW_OPCODE_SYNC:
   case SHADER_OPCODE_BARRIER:
   case FS_OPCODE_SCHEDULING_FENCE:
      return true;
   default:
      return inst->is_send() && inst->as_send()->has_side_effects;
   }
}
static bool
is_memory_volatile(const brw_inst *inst)
{
   /* Anything with memory side effects is treated as volatile for
    * ordering purposes.
    */
   if (has_memory_side_effects(inst))
      return true;

   /* Otherwise, a send explicitly marked volatile. */
   return inst->is_send() && inst->as_send()->is_volatile;
}
static bool
@ -1186,6 +1206,30 @@ brw_instruction_scheduler::add_barrier_deps(schedule_node *n)
}
}
void
brw_instruction_scheduler::add_memory_deps(schedule_node *n)
{
   /* Order a memory-side-effect instruction against the surrounding
    * memory traffic in its block.  Scanning away from n in each
    * direction, every volatile memory access gets an explicit
    * dependency, and the scan stops at the first instruction that itself
    * has memory side effects (after adding a dependency on it) — later
    * instructions are ordered transitively through that one.
    */
   for (schedule_node *s = n - 1; s >= current.start; s--) {
      const bool fence = has_memory_side_effects(s->inst);
      if (fence || is_memory_volatile(s->inst))
         add_dep(s, n, 0);
      if (fence)
         break;
   }

   for (schedule_node *s = n + 1; s < current.end; s++) {
      const bool fence = has_memory_side_effects(s->inst);
      if (fence || is_memory_volatile(s->inst))
         add_dep(n, s, 0);
      if (fence)
         break;
   }
}
/**
* Because some instructions like HALT can disable lanes, scheduling prior to
* a cross lane access should not be allowed, otherwise we could end up with
@ -1338,6 +1382,9 @@ brw_instruction_scheduler::calculate_deps()
if (is_scheduling_barrier(inst))
add_barrier_deps(n);
if (has_memory_side_effects(inst))
add_memory_deps(n);
if (inst->opcode == BRW_OPCODE_HALT ||
inst->opcode == SHADER_OPCODE_HALT_TARGET)
add_cross_lane_deps(n);