diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 0ce0ce9f139..1d2d6389274 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -671,6 +671,7 @@ bool brw_fs_opt_saturate_propagation(fs_visitor &s);
 bool brw_fs_opt_split_sends(fs_visitor &s);
 bool brw_fs_opt_split_virtual_grfs(fs_visitor &s);
 bool brw_fs_opt_zero_samples(fs_visitor &s);
+bool brw_opt_combine_convergent_txf(fs_visitor &s);
 
 bool brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s);
 bool brw_fs_workaround_memory_fence_before_eot(fs_visitor &s);
diff --git a/src/intel/compiler/brw_fs_opt.cpp b/src/intel/compiler/brw_fs_opt.cpp
index 3d04f018d89..309eb4f99c7 100644
--- a/src/intel/compiler/brw_fs_opt.cpp
+++ b/src/intel/compiler/brw_fs_opt.cpp
@@ -85,6 +85,9 @@ brw_fs_optimize(fs_visitor &s)
       progress = false;
       pass_num = 0;
 
+      if (OPT(brw_opt_combine_convergent_txf))
+         OPT(brw_fs_opt_copy_propagation_defs);
+
       if (OPT(brw_fs_lower_pack)) {
          OPT(brw_fs_opt_register_coalesce);
          OPT(brw_fs_opt_dead_code_eliminate);
@@ -552,4 +555,3 @@ brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
 
    return progress;
 }
-
diff --git a/src/intel/compiler/brw_opt_txf_combiner.cpp b/src/intel/compiler/brw_opt_txf_combiner.cpp
new file mode 100644
index 00000000000..b4d912b0257
--- /dev/null
+++ b/src/intel/compiler/brw_opt_txf_combiner.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+static unsigned
+dest_comps_for_txf(const fs_visitor &s, const fs_inst *txf)
+{
+   if (!txf)
+      return 0;
+
+   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
+   const unsigned per_component_regs =
+      DIV_ROUND_UP(brw_type_size_bytes(txf->dst.type) *
+                   txf->exec_size, grf_size);
+   const unsigned dest_regs = txf->size_written / grf_size;
+   const unsigned dest_comps = dest_regs / per_component_regs;
+   return dest_comps;
+}
+
+static bool
+is_def(const def_analysis &defs, const brw_reg &r)
+{
+   return r.file == IMM || r.file == BAD_FILE || defs.get(r) != NULL;
+}
+
+static bool
+is_uniform_def(const def_analysis &defs, const brw_reg &r)
+{
+   return is_def(defs, r) && is_uniform(r);
+}
+
+/**
+ * Check if two texture instructions have a matching source (either the same
+ * immediate value, or both references to the same immutable SSA def and
+ * with matching source modifiers and regions).
+ */
+static bool
+sources_match(ASSERTED const def_analysis &defs,
+              const fs_inst *a, const fs_inst *b, enum tex_logical_srcs src)
+{
+   assert(is_def(defs, a->src[src]));
+   assert(is_def(defs, b->src[src]));
+   return brw_regs_equal(&a->src[src], &b->src[src]);
+}
+
+/**
+ * Look for a series of convergent texture buffer fetches within a basic
+ * block and combine them into a single divergent load with one lane for
+ * each original fetch.  For example, this series of convergent fetches:
+ *
+ *    txf(16) %12:UD, coord = 12d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %13:UD, coord = 13d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %14:UD, coord = 14d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %15:UD, coord = 15d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %16:UD, coord = 16d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %17:UD, coord = 17d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %18:UD, coord = 18d, lod = 0u, handle = %1<0>:D
+ *    txf(16) %19:UD, coord = 19d, lod = 0u, handle = %1<0>:D
+ *
+ * can be combined into a single divergent load and scalar-expansion moves
+ * (which can easily be copy propagated away):
+ *
+ *    load_payload(1) %2:D 12d, 13d, 14d, 15d, 16d, 17d, 18d, 19d
+ *    txf(8) %3:UD, coord = %2, lod = 0u, handle = %1<0>:D
+ *    mov(16) %12:UD, %3+0.0<0>:UD
+ *    ...
+ *    mov(16) %19:UD, %3+0.28<0>:UD
+ *
+ * Our sampler hardware doesn't have any special support for convergent
+ * loads (like LSC transpose/block loads), and always performs SIMD8/16/32
+ * per-channel loads.  But with this trick, we can still combine multiple
+ * convergent loads into a single message with fewer round-trips, and much
+ * lower register pressure.
+ */
+bool
+brw_opt_combine_convergent_txf(fs_visitor &s)
+{
+   const def_analysis &defs = s.def_analysis.require();
+
+   const unsigned min_simd = 8 * reg_unit(s.devinfo);
+   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
+
+   bool progress = false;
+
+   foreach_block(block, s.cfg) {
+      /* Gather a list of convergent TXFs to the same surface in this block */
+      fs_inst *txfs[32] = {};
+      unsigned count = 0;
+
+      foreach_inst_in_block(fs_inst, inst, block) {
+         if (inst->opcode != SHADER_OPCODE_TXF_LOGICAL)
+            continue;
+
+         /* Only handle buffers or single miplevel 1D images for now */
+         if (inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud > 1)
+            continue;
+
+         if (inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0)
+            continue;
+
+         if (inst->predicate || inst->force_writemask_all)
+            continue;
+
+         if (!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_LOD]) ||
+             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE]) ||
+             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]))
+            continue;
+
+         /* Only handle immediates for now: we could check is_uniform(),
+          * but we'd need to ensure the coordinate's definition reaches
+          * txfs[0] which is where we'll insert the combined coordinate.
+          */
+         if (inst->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM)
+            continue;
+
+         /* texelFetch from 1D buffers shouldn't have any of these */
+         assert(inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE);
+         assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
+                inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud == 0);
+
+         if (count > 0 &&
+             (!sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_LOD) ||
+              !sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_SURFACE) ||
+              !sources_match(defs, inst, txfs[0],
+                             TEX_LOGICAL_SRC_SURFACE_HANDLE)))
+            continue;
+
+         txfs[count++] = inst;
+
+         if (count == ARRAY_SIZE(txfs))
+            break;
+      }
+
+      /* Need at least two things to combine. */
+      if (count < 2)
+         continue;
+
+      /* Emit divergent TXFs and replace the original ones with MOVs */
+      for (unsigned curr = 0; curr < count; curr += 32) {
+         const unsigned lanes = CLAMP(count - curr, min_simd, 32);
+         const unsigned width = util_next_power_of_two(lanes);
+         const fs_builder ubld =
+            fs_builder(&s).at(block, txfs[curr]).exec_all().group(width, 0);
+         const fs_builder ubld1 = ubld.group(1, 0);
+
+         enum brw_reg_type coord_type =
+            txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type;
+         brw_reg coord = ubld.vgrf(coord_type);
+         brw_reg coord_comps[32];
+
+         for (unsigned i = 0; i < width; i++) {
+            /* Our block size might be larger than the number of convergent
+             * loads we're combining.  If so, repeat the last component.
+             */
+            if (txfs[curr+i])
+               coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE];
+            else
+               coord_comps[i] = coord_comps[i-1];
+         }
+         ubld1.VEC(coord, coord_comps, width);
+
+         brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
+         srcs[TEX_LOGICAL_SRC_COORDINATE] = coord;
+         srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD];
+         srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
+         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] =
+            txfs[0]->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
+         srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER];
+         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] =
+            txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
+         srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(1);
+         srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
+         srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);
+
+         /* Each of our txf may have a reduced response length if some
+          * components are never read.  Use the maximum of the sizes.
+          */
+         unsigned new_dest_comps = 0;
+         for (unsigned i = 0; i < width; i++) {
+            const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
+            new_dest_comps = MAX2(new_dest_comps, this_comps);
+         }
+
+         /* Emit the new divergent TXF */
+         brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
+         fs_inst *div_txf =
+            ubld.emit(SHADER_OPCODE_TXF_LOGICAL, div, srcs,
+                      TEX_LOGICAL_NUM_SRCS);
+
+         /* Update it to also use response length reduction */
+         const unsigned per_component_regs =
+            DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
+                         grf_size);
+         div_txf->size_written = new_dest_comps * per_component_regs * grf_size;
+
+         for (unsigned i = 0; i < width; i++) {
+            fs_inst *txf = txfs[curr+i];
+            if (!txf)
+               break;
+
+            const fs_builder ibld = fs_builder(&s, block, txf);
+
+            /* Replace each of the original TXFs with MOVs from our new one */
+            const unsigned dest_comps = dest_comps_for_txf(s, txf);
+            assert(dest_comps <= 4);
+
+            brw_reg v[4];
+            for (unsigned c = 0; c < dest_comps; c++)
+               v[c] = component(offset(div, ubld, c), i);
+            ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);
+
+            txf->remove(block);
+         }
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 718b080a38b..252f4fa8762 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -99,6 +99,7 @@ libintel_compiler_brw_files = files(
   'brw_nir_rt.h',
   'brw_nir_rt.c',
   'brw_nir_rt_builder.h',
+  'brw_opt_txf_combiner.cpp',
   'brw_packed_float.c',
   'brw_print.cpp',
   'brw_prim.h',
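Not part of the patch: a standalone back-of-the-envelope sketch of the message and register savings the pass is after. The GRF size, SIMD width, texel size, and fetch count below are illustrative assumptions (32-byte GRFs, SIMD16 dispatch, 32-bit RGBA returns, eight convergent fetches), not values taken from the pass itself.

   // Illustrative only: rough message/register counts before and after
   // combining eight convergent SIMD16 fetches into one SIMD8 fetch.
   // All constants are assumptions for this sketch, not compiler values.
   #include <cstdio>

   int main()
   {
      const unsigned grf_size  = 32;  // bytes per GRF (pre-Xe2 assumption)
      const unsigned exec_size = 16;  // SIMD16 dispatch
      const unsigned comps     = 4;   // full RGBA return, 4 bytes/component
      const unsigned fetches   = 8;   // convergent texelFetch calls

      // Before: every convergent fetch still returns one texel per channel.
      const unsigned regs_per_fetch = comps * exec_size * 4 / grf_size;
      printf("before: %u messages, %u GRFs of results\n",
             fetches, fetches * regs_per_fetch);

      // After: a single SIMD8 fetch holds all eight results, one per lane;
      // the original destinations become stride-0 reads of that payload.
      const unsigned combined_regs = comps * 8 * 4 / grf_size;
      printf("after:  1 message, %u GRFs of results\n", combined_regs);
      return 0;
   }

Under those assumptions this prints 8 messages and 64 GRFs before versus 1 message and 4 GRFs after, which is the "fewer round-trips, much lower register pressure" effect the pass's comment describes.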