diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h
index 29801caa6de..4b3554b71e8 100644
--- a/src/intel/compiler/brw/brw_eu_defines.h
+++ b/src/intel/compiler/brw/brw_eu_defines.h
@@ -669,6 +669,10 @@ enum memory_flags {
     *  fusion (Gfx12.x only).
     */
    MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4,
+   /** Whether this memory load can be arbitrarily reordered or CSE'd
+    *  with other loads.
+    */
+   MEMORY_FLAG_CAN_REORDER = 1 << 5,
 };
 
 enum rt_logical_srcs {
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 13902d4ad67..ffd023e0a58 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -5930,13 +5930,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       (nir_intrinsic_access(instr) & ACCESS_COHERENT);
    const bool fused_eu_disable = nir_intrinsic_has_access(instr) &&
       (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL);
+   const bool can_reorder = nir_intrinsic_can_reorder(instr);
    const unsigned alignment =
       nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;
    uint8_t flags =
       (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
       (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
       (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) |
-      (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0);
+      (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0) |
+      (can_reorder ? MEMORY_FLAG_CAN_REORDER : 0);
    bool no_mask_handle = false;
 
    uint8_t coord_components = 1;
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index 7fbe16f85ea..f141f0b7c55 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -1192,6 +1192,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS;
    const bool has_side_effects = mem->has_side_effects();
    const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
+   const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER;
 
    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
    const enum brw_reg_type data_type =
@@ -1344,7 +1345,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->ex_mlen = ex_mlen;
    send->header_size = 0;
    send->has_side_effects = has_side_effects;
-   send->is_volatile = !has_side_effects || volatile_access;
+   send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access;
    send->fused_eu_disable = fused_eu_disable;
 
    /* Finally, the payload */
@@ -1405,6 +1406,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
    const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
    const bool has_side_effects = mem->has_side_effects();
+   const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER;
    const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null();
    assert(mem->address_offset == 0);
 
@@ -1610,7 +1612,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->ex_mlen = ex_mlen;
    send->header_size = header.file != BAD_FILE ? 1 : 0;
    send->has_side_effects = has_side_effects;
-   send->is_volatile = !has_side_effects || volatile_access;
+   send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access;
    send->fused_eu_disable = fused_eu_disable;
 
    if (block) {
diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp
index 3aac87b4ce6..b7e5bf37118 100644
--- a/src/intel/compiler/brw/brw_opt_cse.cpp
+++ b/src/intel/compiler/brw/brw_opt_cse.cpp
@@ -94,7 +94,7 @@ is_expression(const brw_shader *v, const brw_inst *const inst)
    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
       return true;
    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
-      return inst->as_mem()->mode == MEMORY_MODE_CONSTANT;
+      return inst->as_mem()->flags & MEMORY_FLAG_CAN_REORDER;
    case SHADER_OPCODE_LOAD_PAYLOAD:
       return !is_coalescing_payload(*v, inst);
    case SHADER_OPCODE_SEND:
diff --git a/src/intel/compiler/brw/brw_schedule_instructions.cpp b/src/intel/compiler/brw/brw_schedule_instructions.cpp
index a2a837eaea5..f856eaa0995 100644
--- a/src/intel/compiler/brw/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw/brw_schedule_instructions.cpp
@@ -571,6 +571,7 @@ public:
                              int block_count, bool post_reg_alloc, bool need_latencies);
 
    void add_barrier_deps(schedule_node *n);
+   void add_memory_deps(schedule_node *n);
    void add_cross_lane_deps(schedule_node *n);
    void add_dep(schedule_node *before, schedule_node *after, int latency);
    void add_dep(schedule_node *before, schedule_node *after);
@@ -1080,8 +1081,43 @@ static bool
 is_scheduling_barrier(const brw_inst *inst)
 {
    return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
+          inst->opcode == SHADER_OPCODE_RND_MODE ||
+          inst->opcode == SHADER_OPCODE_FLOAT_CONTROL_MODE ||
           (inst->is_control_flow() && inst->opcode != BRW_OPCODE_HALT) ||
-          inst->has_side_effects();
+          inst->eot;
+}
+
+/**
+ * Returns true for SYNC, BARRIER and scheduling-fence instructions, and for
+ * any SEND whose has_side_effects flag is set.  calculate_deps() calls
+ * add_memory_deps() for these.  Asserts that inst is not an LSC spill.
+ */
+static bool
+has_memory_side_effects(const brw_inst *inst)
+{
+   assert(inst->opcode != SHADER_OPCODE_LSC_SPILL);
+
+   switch (inst->opcode) {
+   case BRW_OPCODE_SYNC:
+   case SHADER_OPCODE_BARRIER:
+   case FS_OPCODE_SCHEDULING_FENCE:
+      return true;
+   default:
+      return inst->is_send() && inst->as_send()->has_side_effects;
+   }
+}
+
+/**
+ * Returns true if the instruction has memory side effects or is a SEND
+ * marked is_volatile.
+ */
+static bool
+is_memory_volatile(const brw_inst *inst)
+{
+   if (has_memory_side_effects(inst))
+      return true;
+
+   return inst->is_send() && inst->as_send()->is_volatile;
 }
 
 static bool
@@ -1186,6 +1206,30 @@ brw_instruction_scheduler::add_barrier_deps(schedule_node *n)
    }
 }
 
+/**
+ * Adds a dependency between n and every memory-volatile instruction on
+ * either side, up to and including the nearest with memory side effects.
+ */
+void
+brw_instruction_scheduler::add_memory_deps(schedule_node *n)
+{
+   for (schedule_node *earlier = n - 1; earlier >= current.start; earlier--) {
+      if (!is_memory_volatile(earlier->inst))
+         continue;
+      add_dep(earlier, n, 0);
+      if (has_memory_side_effects(earlier->inst))
+         break;
+   }
+
+   for (schedule_node *later = n + 1; later < current.end; later++) {
+      if (!is_memory_volatile(later->inst))
+         continue;
+      add_dep(n, later, 0);
+      if (has_memory_side_effects(later->inst))
+         break;
+   }
+}
+
 /**
  * Because some instructions like HALT can disable lanes, scheduling prior to
  * a cross lane access should not be allowed, otherwise we could end up with
@@ -1338,6 +1382,9 @@ brw_instruction_scheduler::calculate_deps()
       if (is_scheduling_barrier(inst))
          add_barrier_deps(n);
 
+      if (has_memory_side_effects(inst))
+         add_memory_deps(n);
+
       if (inst->opcode == BRW_OPCODE_HALT ||
           inst->opcode == SHADER_OPCODE_HALT_TARGET)
           add_cross_lane_deps(n);