diff --git a/src/intel/compiler/brw/brw_disasm.c b/src/intel/compiler/brw/brw_disasm.c
index 43636930706..e0709bec5c0 100644
--- a/src/intel/compiler/brw/brw_disasm.c
+++ b/src/intel/compiler/brw/brw_disasm.c
@@ -206,6 +206,11 @@ static const char *const branch_ctrl[2] = {
    [1] = "BranchCtrl"
 };
 
+static const char *const fusion_ctrl[2] = {
+   [0] = "",
+   [1] = "FusionCtrl"
+};
+
 static const char *const wectrl[2] = {
    [0] = "",
    [1] = "WE_all"
@@ -2619,6 +2624,12 @@ brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
          err |= control(file, "acc write control", accwr,
                         brw_eu_inst_acc_wr_control(devinfo, inst), &space);
       }
+
+      if (devinfo->ver == 12 && is_send(opcode)) {
+         err |= control(file, "fusion ctrl", fusion_ctrl,
+                        brw_eu_inst_fusion_ctrl(devinfo, inst), &space);
+      }
+
       if (is_send(opcode))
          err |= control(file, "end of thread", end_of_thread,
                         brw_eu_inst_eot(devinfo, inst), &space);
diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h
index 781239687a0..e40f2339e6b 100644
--- a/src/intel/compiler/brw/brw_eu_defines.h
+++ b/src/intel/compiler/brw/brw_eu_defines.h
@@ -712,6 +712,10 @@ enum memory_flags {
    MEMORY_FLAG_VOLATILE_ACCESS = 1 << 2,
    /** Whether memory access is marked coherent by GLSL/SPIR-V. */
    MEMORY_FLAG_COHERENT_ACCESS = 1 << 3,
+   /** Whether this instruction should run serialized with regard to EU
+    * fusion (Gfx12.x only).
+    */
+   MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4,
 };
 
 enum rt_logical_srcs {
diff --git a/src/intel/compiler/brw/brw_eu_inst.h b/src/intel/compiler/brw/brw_eu_inst.h
index 77c390b5ecd..e5a49b36460 100644
--- a/src/intel/compiler/brw/brw_eu_inst.h
+++ b/src/intel/compiler/brw/brw_eu_inst.h
@@ -897,6 +897,7 @@ brw_eu_inst_sends_ex_desc(const struct intel_device_info *devinfo,
  * @{
  */
 F(eot,             /* 9+ */ 127, 127,       /* 12+ */ 34, 34)
+F(fusion_ctrl,     /* 9+ */ -1, -1,         /* 12+ */ 33, 33)
 F(mlen,            /* 9+ */ 124, 121,       /* 12+ */ MD12(28), MD12(25))
 F(rlen,            /* 9+ */ 120, 116,       /* 12+ */ MD12(24), MD12(20))
 F(header_present,  /* 9+ */ 115, 115,      /* 12+ */ MD12(19), MD12(19))
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 5804655c7f9..f96bbbbbc93 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -6471,6 +6471,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       brw_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                  srcs, GET_BUFFER_SIZE_SRCS);
       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
+      inst->fused_eu_disable =
+         (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
 
       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
        *
@@ -7016,12 +7018,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       (nir_intrinsic_access(instr) & ACCESS_VOLATILE);
    const bool coherent_access = nir_intrinsic_has_access(instr) &&
       (nir_intrinsic_access(instr) & ACCESS_COHERENT);
+   const bool fused_eu_disable = nir_intrinsic_has_access(instr) &&
+      (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL);
    const unsigned align = nir_intrinsic_has_align(instr) ?
                           nir_intrinsic_align(instr) : 0;
 
    uint8_t flags = (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
                    (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
-                   (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0);
+                   (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) |
+                   (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0);
 
    bool no_mask_handle = false;
    int data_src = -1;
@@ -7661,6 +7666,7 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
    tex->residency = instr->is_sparse;
    tex->coord_components = instr->coord_components;
    tex->grad_components = lod_components;
+   tex->fused_eu_disable = (instr->backend_flags & BRW_TEX_INSTR_FUSED_EU_DISABLE) != 0;
 
    /* Wa_14012688258:
    *
diff --git a/src/intel/compiler/brw/brw_generator.cpp b/src/intel/compiler/brw/brw_generator.cpp
index d6252d3a56f..2a76d8bd821 100644
--- a/src/intel/compiler/brw/brw_generator.cpp
+++ b/src/intel/compiler/brw/brw_generator.cpp
@@ -198,6 +198,10 @@ brw_generator::generate_send(brw_send_inst *inst,
       brw_eu_inst_set_opcode(p->isa, brw_last_inst,
                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
    }
+
+   /* Serialize messages if needed */
+   if (devinfo->ver == 12 && inst->fused_eu_disable)
+      brw_eu_inst_set_fusion_ctrl(devinfo, brw_last_inst, true);
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_inst.h b/src/intel/compiler/brw/brw_inst.h
index 8474a6da3fb..6ad4d786ea8 100644
--- a/src/intel/compiler/brw/brw_inst.h
+++ b/src/intel/compiler/brw/brw_inst.h
@@ -213,7 +213,12 @@ struct brw_inst : brw_exec_node {
           */
          bool has_no_mask_send_params:1;
 
-         uint8_t pad:5;
+         /**
+          * Serialize the message (Gfx12.x only)
+          */
+         bool fused_eu_disable:1;
+
+         uint8_t pad:4;
       };
       uint16_t bits;
    };
@@ -261,6 +266,11 @@ struct brw_send_inst : brw_inst {
           */
         bool ex_bso:1;
 
+        /**
+         * Serialize the message (Gfx12.x only)
+         */
+        bool fused_eu_disable:1;
+
         /**
          * Only for SHADER_OPCODE_SEND, @offset field contains an immediate
          * part of the extended descriptor that must be encoded in the
@@ -268,7 +278,7 @@ struct brw_send_inst : brw_inst {
          */
         bool ex_desc_imm:1;
 
-        uint8_t pad:3;
+        uint8_t pad:2;
      };
      uint8_t send_bits;
   };
@@ -279,9 +289,28 @@ struct brw_tex_inst : brw_inst {
    uint32_t offset;
    uint8_t coord_components;
    uint8_t grad_components;
-   bool residency:1;
-   bool surface_bindless:1;
-   bool sampler_bindless:1;
+   union {
+      struct {
+         /**
+          * Whether the instruction requests the residency data (additional register
+          * written).
+          */
+         bool residency:1;
+         /**
+          * Serialize the message (Gfx12.x only)
+          */
+         bool fused_eu_disable:1;
+         /**
+          * Whether the surface handle is bindless
+          */
+         bool surface_bindless:1;
+         /**
+          * Whether the sampler handle is bindless
+          */
+         bool sampler_bindless:1;
+      };
+      uint8_t bits;
+   };
 };
 
 struct brw_mem_inst : brw_inst {
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index 17760cdc838..10ab57705a0 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -1217,9 +1217,12 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
       }
    }
 
+   const bool fused_eu_disable = tex->fused_eu_disable;
+
    brw_send_inst *send = brw_transform_inst_to_send(bld, tex);
    tex = NULL;
 
+   send->fused_eu_disable = fused_eu_disable;
    send->mlen = mlen;
    send->header_size = header_size;
    send->sfid = BRW_SFID_SAMPLER;
@@ -1481,6 +1484,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
    const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS;
    const bool has_side_effects = mem->has_side_effects();
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
 
    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
    const enum brw_reg_type data_type =
@@ -1634,6 +1638,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;
 
    /* Finally, the payload */
    send->src[SEND_SRC_PAYLOAD1] = payload;
@@ -1692,6 +1697,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool block = mem->flags & MEMORY_FLAG_TRANSPOSE;
    const bool include_helpers = mem->flags & MEMORY_FLAG_INCLUDE_HELPERS;
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
    const bool has_side_effects = mem->has_side_effects();
    const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null();
    assert(mem->address_offset == 0);
@@ -1903,6 +1909,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = header.file != BAD_FILE ? 1 : 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;
 
    if (block) {
       assert(send->force_writemask_all);
@@ -2447,6 +2454,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
    brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
    brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
    brw_reg lod = bld.move_to_vgrf(inst->src[GET_BUFFER_SIZE_SRC_LOD], 1);
+   const bool fused_eu_disable = inst->fused_eu_disable;
 
    brw_send_inst *send = brw_transform_inst_to_send(bld, inst);
    inst = NULL;
@@ -2468,6 +2476,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
 
    send->dst = retype(send->dst, BRW_TYPE_UW);
    send->sfid = BRW_SFID_SAMPLER;
+   send->fused_eu_disable = fused_eu_disable;
    setup_surface_descriptors(bld, send, desc, surface, surface_handle);
 }
 
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 6159b47b245..78c1ccfed56 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -2065,6 +2065,86 @@ lower_txd_cb(const nir_tex_instr *tex, const void *data)
    return false;
 }
 
+static bool
+flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      for (unsigned i = 0; i < tex->num_srcs; ++i) {
+         nir_tex_src_type src_type = tex->src[i].src_type;
+
+         if (src_type != nir_tex_src_texture_handle &&
+             src_type != nir_tex_src_sampler_handle &&
+             src_type != nir_tex_src_texture_offset &&
+             src_type != nir_tex_src_sampler_offset)
+            continue;
+
+         if (nir_src_is_divergent(&tex->src[i].src)) {
+            tex->backend_flags |= BRW_TEX_INSTR_FUSED_EU_DISABLE;
+            return true;
+         }
+      }
+      return false;
+   }
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      /* We only need to care about intrinsics that refer to a structure/descriptor
+       * outside of the EU's registers like RENDER_SURFACE_STATE/SAMPLER_STATE,
+       * because the fusing will pick one thread's descriptor handle and use that
+       * for the 2 fused threads.
+       *
+       * Global pointers don't have that problem since all the access's data is
+       * per lane in the payload of the SEND message (the 64bit pointer).
+       *
+       * URB/shared-memory don't have that problem either because there is no
+       * descriptor information outside the EU, it's just a per lane
+       * handle/offset.
+       */
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_ssbo_uniform_block_intel:
+      case nir_intrinsic_load_ubo_uniform_block_intel:
+      case nir_intrinsic_load_ssbo_block_intel:
+      case nir_intrinsic_load_ssbo_intel:
+      case nir_intrinsic_store_ssbo_intel:
+      case nir_intrinsic_load_ssbo:
+      case nir_intrinsic_store_ssbo:
+      case nir_intrinsic_get_ssbo_size:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_image_load:
+      case nir_intrinsic_image_store:
+      case nir_intrinsic_image_size:
+      case nir_intrinsic_image_levels:
+      case nir_intrinsic_image_atomic:
+      case nir_intrinsic_image_atomic_swap:
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_bindless_image_size:
+      case nir_intrinsic_bindless_image_levels:
+      case nir_intrinsic_bindless_image_atomic:
+      case nir_intrinsic_bindless_image_atomic_swap: {
+         int src_idx = nir_get_io_index_src_number(intrin);
+         if (nir_src_is_divergent(&intrin->src[src_idx])) {
+            nir_intrinsic_set_access(intrin,
+                                     nir_intrinsic_access(intrin) |
+                                     ACCESS_FUSED_EU_DISABLE_INTEL);
+            return true;
+         }
+         return false;
+      }
+
+      default:
+         return false;
+      }
+   }
+
+   default:
+      return false;
+   }
+}
+
 /* Prepare the given shader for codegen
  *
  * This function is intended to be called right before going into the actual
@@ -2283,6 +2363,28 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
 
       OPT(nir_lower_subgroups, &subgroups_options);
    }
+
+   /* Deal with EU fusion */
+   if (devinfo->ver == 12) {
+      nir_divergence_options options =
+         nir_divergence_across_subgroups |
+         nir_divergence_multiple_workgroup_per_compute_subgroup;
+
+      nir_foreach_function_impl(impl, nir) {
+         nir_divergence_analysis_impl(impl, options);
+      }
+
+      nir_shader_instructions_pass(nir,
+                                   flag_fused_eu_disable_instr,
+                                   nir_metadata_all, NULL);
+
+      /* We requested special divergence information which is not needed
+       * afterwards.
+       */
+      nir_foreach_function_impl(impl, nir) {
+         nir_progress(true, impl, ~nir_metadata_divergence);
+      }
+   }
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 078e5d1b21e..d9195f1c062 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -32,6 +32,8 @@ extern "C" {
 #endif
 
+#define BRW_TEX_INSTR_FUSED_EU_DISABLE (1u << 30)
+
 extern const struct nir_shader_compiler_options brw_scalar_nir_options;
 
 int type_size_vec4(const struct glsl_type *type, bool bindless);
diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp
index c7983d311e4..8e2267b9479 100644
--- a/src/intel/compiler/brw/brw_opt_cse.cpp
+++ b/src/intel/compiler/brw/brw_opt_cse.cpp
@@ -391,9 +391,7 @@ hash_inst(const void *v)
       const uint8_t tex_u8data[] = {
          tex->coord_components,
         tex->grad_components,
-         tex->residency,
-         tex->surface_bindless,
-         tex->sampler_bindless,
+         tex->bits,
       };
       const uint32_t tex_u32data[] = {
          tex->sampler_opcode,