diff --git a/src/gallium/drivers/iris/driinfo_iris.h b/src/gallium/drivers/iris/driinfo_iris.h
index d0fa3b6c1d9..9486da874fb 100644
--- a/src/gallium/drivers/iris/driinfo_iris.h
+++ b/src/gallium/drivers/iris/driinfo_iris.h
@@ -14,6 +14,8 @@ DRI_CONF_SECTION_END
 
 DRI_CONF_SECTION_PERFORMANCE
    DRI_CONF_ADAPTIVE_SYNC(true)
+   DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
+   DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
    DRI_CONFIG_INTEL_TBIMR(true)
    DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
    DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)
diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c
index 4e7cae5bb9a..dede919098f 100644
--- a/src/gallium/drivers/iris/iris_program_cache.c
+++ b/src/gallium/drivers/iris/iris_program_cache.c
@@ -163,8 +163,6 @@ iris_upload_shader(struct iris_screen *screen,
                    const void *key,
                    const void *assembly)
 {
-   const struct intel_device_info *devinfo = screen->devinfo;
-
    u_upload_alloc_ref(uploader, 0, shader->program_size, 64,
                   &shader->assembly.offset, &shader->assembly.res,
                   &shader->map);
@@ -200,7 +198,7 @@ iris_upload_shader(struct iris_screen *screen,
    }
 
    /* Store the 3DSTATE shader packets and other derived state. */
-   screen->vtbl.store_derived_program_state(devinfo, cache_id, shader);
+   screen->vtbl.store_derived_program_state(screen, cache_id, shader);
 
    util_queue_fence_signal(&shader->ready);
 
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index bb2e3bbf936..54374f019e8 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -745,6 +745,10 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
       driQueryOptionb(config->options, "intel_enable_wa_14018912822");
    screen->driconf.intel_enable_wa_14024015672_msaa =
       driQueryOptionb(config->options, "intel_enable_wa_14024015672_msaa");
+   screen->driconf.force_sampler_prefetch =
+      driQueryOptionb(config->options, "intel_force_sampler_prefetch");
+   screen->driconf.force_compute_surface_prefetch =
+      driQueryOptionb(config->options, "intel_force_compute_surface_prefetch");
    screen->driconf.enable_tbimr =
       driQueryOptionb(config->options, "intel_tbimr");
    screen->driconf.enable_vf_distribution =
diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h
index 0e6a7f540b5..902c0c95cd1 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -132,7 +132,7 @@ struct iris_vtable {
                                      uint32_t offset);
 
    unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
-   void (*store_derived_program_state)(const struct intel_device_info *devinfo,
+   void (*store_derived_program_state)(const struct iris_screen *screen,
                                        enum iris_program_cache_id cache_id,
                                        struct iris_compiled_shader *shader);
    uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@@ -209,6 +209,8 @@ struct iris_screen {
       bool enable_te_distribution;
       unsigned generated_indirect_threshold;
       bool disable_threaded_context;
+      bool force_sampler_prefetch;
+      bool force_compute_surface_prefetch;
    } driconf;
 
    /** Does the kernel support various features (KERNEL_HAS_* bitfield)? */
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 0de2cd5d6f3..bfccf68850d 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -5111,20 +5111,40 @@ iris_populate_cs_key(const struct iris_context *ice,
 }
 
 static inline uint32_t
-encode_sampler_count(const struct iris_compiled_shader *shader)
+encode_sampler_count(const struct iris_screen *screen,
+                     const struct iris_compiled_shader *shader)
 {
+#if GFX_VER == 11
+   /* Wa_1606682166: sampler state prefetch must stay disabled on Gfx11. */
+   return 0;
+#else /* GFX_VER != 11 */
+   if (!screen->driconf.force_sampler_prefetch)
+      return 0; /* prefetch is opt-in via intel_force_sampler_prefetch */
    /* We can potentially have way more than 32 samplers and that's ok.
     * However, the 3DSTATE_XS packets only have 3 bits to specify how
     * many to pre-fetch and all values above 4 are marked reserved.
     */
    uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
    return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
+#endif /* GFX_VER == 11 */
+}
+
+static inline uint32_t
+encode_surface_count(const struct iris_screen *screen,
+                     const struct iris_compiled_shader *shader)
+{
+#if GFX_VERx10 >= 125
+   if (shader->stage == MESA_SHADER_COMPUTE &&
+       !screen->driconf.force_compute_surface_prefetch)
+      return 0; /* compute surface prefetch is opt-in via driconf */
+#endif /* GFX_VERx10 >= 125 */
+   return shader->bt.size_bytes / 4; /* used as BindingTableEntryCount */
 }
 
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
    pkt.KernelStartPointer = KSP(shader);                                  \
-   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
-   pkt.SamplerCount = encode_sampler_count(shader);                       \
+   pkt.BindingTableEntryCount = encode_surface_count(screen, shader);     \
+   pkt.SamplerCount = encode_sampler_count(screen, shader);               \
    pkt.FloatingPointMode = shader->use_alt_mode;                          \
                                                                           \
    pkt.DispatchGRFStartRegisterForURBData =                               \
@@ -5180,9 +5200,10 @@ encode_sampler_count(const struct iris_compiled_shader *shader)
  * Encode most of 3DSTATE_VS based on the compiled shader.
  */
 static void
-iris_store_vs_state(const struct intel_device_info *devinfo,
+iris_store_vs_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
    struct iris_vue_data *vue_data = iris_vue_data(shader);
 
    iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
@@ -5203,9 +5224,10 @@ iris_store_vs_state(const struct intel_device_info *devinfo,
  * Encode most of 3DSTATE_HS based on the compiled shader.
  */
 static void
-iris_store_tcs_state(const struct intel_device_info *devinfo,
+iris_store_tcs_state(const struct iris_screen *screen,
                      struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
    struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
    struct iris_vue_data *vue_data = &tcs_data->base;
 
@@ -5252,9 +5274,10 @@ iris_store_tcs_state(const struct intel_device_info *devinfo,
  * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
  */
 static void
-iris_store_tes_state(const struct intel_device_info *devinfo,
+iris_store_tes_state(const struct iris_screen *screen,
                      struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
    struct iris_tes_data *tes_data = iris_tes_data(shader);
    struct iris_vue_data *vue_data = &tes_data->base;
 
@@ -5319,9 +5342,10 @@ iris_store_tes_state(const struct intel_device_info *devinfo,
  * Encode most of 3DSTATE_GS based on the compiled shader.
  */
 static void
-iris_store_gs_state(const struct intel_device_info *devinfo,
+iris_store_gs_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
    struct iris_gs_data *gs_data = iris_gs_data(shader);
    struct iris_vue_data *vue_data = &gs_data->base;
 
@@ -5367,9 +5391,10 @@ iris_store_gs_state(const struct intel_device_info *devinfo,
  * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
  */
 static void
-iris_store_fs_state(const struct intel_device_info *devinfo,
+iris_store_fs_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
    struct iris_fs_data *fs_data = iris_fs_data(shader);
 
    uint32_t *ps_state = (void *) shader->derived_data;
@@ -5377,8 +5402,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
 
    iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
       ps.VectorMaskEnable = fs_data->uses_vmask;
-      ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
-      ps.SamplerCount = encode_sampler_count(shader);
+      ps.BindingTableEntryCount = encode_surface_count(screen, shader);
+      ps.SamplerCount = encode_sampler_count(screen, shader);
       ps.FloatingPointMode = shader->use_alt_mode;
       ps.MaximumNumberofThreadsPerPSD =
          devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
@@ -5453,7 +5478,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
  * This must match the data written by the iris_store_xs_state() functions.
  */
 static void
-iris_store_cs_state(const struct intel_device_info *devinfo,
+iris_store_cs_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
    struct iris_cs_data *cs_data = iris_cs_data(shader);
@@ -5471,10 +5496,8 @@ iris_store_cs_state(const struct intel_device_info *devinfo,
 #if GFX_VERx10 <= 125
       desc.BarrierEnable = cs_data->uses_barrier;
 #endif
-      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-      desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
-         0 : MIN2(shader->bt.size_bytes / 4, 31);
-      desc.SamplerCount = encode_sampler_count(shader);
+      desc.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
+      desc.SamplerCount = encode_sampler_count(screen, shader);
       /* TODO: Check if we are missing workarounds and enable mid-thread
        * preemption.
        *
@@ -5522,28 +5545,28 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
  * get most of the state packet without having to reconstruct it.
  */
 static void
-iris_store_derived_program_state(const struct intel_device_info *devinfo,
+iris_store_derived_program_state(const struct iris_screen *screen,
                                  enum iris_program_cache_id cache_id,
                                  struct iris_compiled_shader *shader)
 {
    switch (cache_id) {
    case IRIS_CACHE_VS:
-      iris_store_vs_state(devinfo, shader);
+      iris_store_vs_state(screen, shader);
       break;
    case IRIS_CACHE_TCS:
-      iris_store_tcs_state(devinfo, shader);
+      iris_store_tcs_state(screen, shader);
       break;
    case IRIS_CACHE_TES:
-      iris_store_tes_state(devinfo, shader);
+      iris_store_tes_state(screen, shader);
       break;
    case IRIS_CACHE_GS:
-      iris_store_gs_state(devinfo, shader);
+      iris_store_gs_state(screen, shader);
       break;
    case IRIS_CACHE_FS:
-      iris_store_fs_state(devinfo, shader);
+      iris_store_fs_state(screen, shader);
       break;
    case IRIS_CACHE_CS:
-      iris_store_cs_state(devinfo, shader);
+      iris_store_cs_state(screen, shader);
       break;
    case IRIS_CACHE_BLORP:
       break;
@@ -9248,11 +9271,9 @@ iris_upload_compute_walker(struct iris_context *ice,
                                                    dispatch.group_size,
                                                    dispatch.simd_size);
    idd.SamplerStatePointer = shs->sampler_table.offset;
-   idd.SamplerCount = encode_sampler_count(shader),
+   idd.SamplerCount = encode_sampler_count(screen, shader);
    idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
-   /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-   idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
-      0 : MIN2(shader->bt.size_bytes / 4, 31);
+   idd.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
    idd.NumberOfBarriers = cs_data->uses_barrier;
 #if GFX_VER >= 30
    idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c
index 27b540ea4d9..2e1a3e76c38 100644
--- a/src/intel/vulkan/anv_instance.c
+++ b/src/intel/vulkan/anv_instance.c
@@ -36,6 +36,8 @@ static const driOptionDescription anv_dri_options[] = {
       DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
       DRI_CONF_ANV_DISABLE_LINK_TIME_OPTIMIZATION(false)
       DRI_CONF_SHADER_SPILLING_RATE(11)
+      DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
+      DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
       DRI_CONFIG_INTEL_TBIMR(true)
       DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
       DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)
@@ -200,6 +202,10 @@ anv_init_dri_options(struct anv_instance *instance)
        driQueryOptioni(&instance->dri_options, "force_vk_vendor");
     instance->has_fake_sparse =
        driQueryOptionb(&instance->dri_options, "fake_sparse");
+    instance->force_sampler_prefetch =
+       driQueryOptionb(&instance->dri_options, "intel_force_sampler_prefetch");
+    instance->force_compute_surface_prefetch =
+       driQueryOptionb(&instance->dri_options, "intel_force_compute_surface_prefetch");
     instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
     instance->enable_vf_distribution =
        driQueryOptionb(&instance->dri_options, "intel_vf_distribution");
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 5d1fccb6bcb..fb5a58f3888 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1782,6 +1782,8 @@ struct anv_instance {
     bool                                        custom_border_colors_without_format;
     bool                                        vf_component_packing;
     bool                                        large_workgroup_non_coherent_image_workaround;
+    bool                                        force_sampler_prefetch;
+    bool                                        force_compute_surface_prefetch;
 
     /* HW workarounds */
     bool                                        no_16bit;
diff --git a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c
index 747427b8890..5feddc1f62b 100644
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@@ -27,15 +27,36 @@
         }))
 
 static uint32_t
-get_sampler_count(const struct anv_shader *shader)
+get_surface_count(const struct anv_device *device,
+                  const struct anv_shader *shader)
 {
-   uint32_t count_by_4 = DIV_ROUND_UP(shader->bind_map.sampler_count, 4);
+#if GFX_VERx10 >= 125
+   if (shader->vk.stage == MESA_SHADER_COMPUTE &&
+       !device->physical->instance->force_compute_surface_prefetch)
+      return 0; /* compute surface prefetch is opt-in via driconf */
+#endif /* GFX_VERx10 >= 125 */
+   return shader->bind_map.surface_count;
+}
 
-   /* We can potentially have way more than 32 samplers and that's ok.
-    * However, the 3DSTATE_XS packets only have 3 bits to specify how
-    * many to pre-fetch and all values above 4 are marked reserved.
+static uint32_t
+get_sampler_count(const struct anv_device *device,
+                  const struct anv_shader *shader)
+{
+#if GFX_VER == 11
+   /* Wa_1606682166:
+    *
+    * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. Disable
+    * the Sampler state prefetch functionality in the SARB by programming
+    * 0xB000[30] to '1'.
     */
-   return MIN2(count_by_4, 4);
+   return 0;
+#else /* GFX_VER != 11 */
+   if (!device->physical->instance->force_sampler_prefetch)
+      return 0; /* prefetch is opt-in via intel_force_sampler_prefetch */
+
+   return DIV_ROUND_UP( /* encoded values above 4 are reserved */
+      CLAMP(shader->bind_map.sampler_count, 0, 16), 4);
+#endif /* GFX_VER == 11 */
 }
 
 static UNUSED struct anv_address
@@ -557,13 +578,8 @@ emit_vs_shader(struct anv_batch *batch,
       vs.SingleVertexDispatch       = false;
 #endif
       vs.VectorMaskEnable           = false;
-      /* Wa_1606682166:
-       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
-       * Disable the Sampler state prefetch functionality in the SARB by
-       * programming 0xB000[30] to '1'.
-       */
-      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      vs.BindingTableEntryCount     = shader->bind_map.surface_count;
+      vs.SamplerCount               = get_sampler_count(device, shader);
+      vs.BindingTableEntryCount     = get_surface_count(device, shader);
       vs.FloatingPointMode          = IEEE754;
       vs.IllegalOpcodeExceptionEnable = false;
       vs.SoftwareExceptionEnable    = false;
@@ -619,9 +635,8 @@ emit_hs_shader(struct anv_batch *batch,
       hs.Enable = true;
       hs.StatisticsEnable = true;
       hs.KernelStartPointer = shader->kernel.offset;
-      /* Wa_1606682166 */
-      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      hs.BindingTableEntryCount = shader->bind_map.surface_count;
+      hs.SamplerCount = get_sampler_count(device, shader);
+      hs.BindingTableEntryCount = get_surface_count(device, shader);
 
 #if GFX_VER >= 12
       /* Wa_1604578095:
@@ -724,9 +739,8 @@ emit_ds_shader(struct anv_batch *batch,
       ds.Enable = true;
       ds.StatisticsEnable = true;
       ds.KernelStartPointer = shader->kernel.offset;
-      /* Wa_1606682166 */
-      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      ds.BindingTableEntryCount = shader->bind_map.surface_count;
+      ds.SamplerCount = get_sampler_count(device, shader);
+      ds.BindingTableEntryCount = get_surface_count(device, shader);
       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
 
       ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
@@ -799,9 +813,8 @@ emit_gs_shader(struct anv_batch *batch,
 
       gs.SingleProgramFlow       = false;
       gs.VectorMaskEnable        = false;
-      /* Wa_1606682166 */
-      gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      gs.BindingTableEntryCount  = shader->bind_map.surface_count;
+      gs.SamplerCount            = get_sampler_count(device, shader);
+      gs.BindingTableEntryCount  = get_surface_count(device, shader);
       gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
       gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
 
@@ -1060,9 +1073,8 @@ emit_ps_shader(struct anv_batch *batch,
 
       ps.SingleProgramFlow          = false;
       ps.VectorMaskEnable           = wm_prog_data->uses_vmask;
-      /* Wa_1606682166 */
-      ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      ps.BindingTableEntryCount     = shader->bind_map.surface_count;
+      ps.SamplerCount               = get_sampler_count(device, shader);
+      ps.BindingTableEntryCount     = get_surface_count(device, shader);
 #if GFX_VER < 20
       ps.PushConstantEnable         = wm_prog_data->base.push_sizes[0] > 0;
 #endif
@@ -1177,11 +1189,8 @@ emit_cs_shader(struct anv_batch *batch,
       },
       .InterfaceDescriptor            = {
          .KernelStartPointer                = shader->kernel.offset,
-         .SamplerCount                      = DIV_ROUND_UP(
-            CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
-         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-         .BindingTableEntryCount            = devinfo->verx10 == 125 ?
-                                              0 : 1 + MIN2(shader->bind_map.surface_count, 30),
+         .SamplerCount                      = get_sampler_count(device, shader),
+         .BindingTableEntryCount            = MIN2(get_surface_count(device, shader), 31),
          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
          .SharedLocalMemorySize             = intel_compute_slm_encode_size(
             GFX_VER, cs_prog_data->base.total_shared),
@@ -1231,16 +1240,8 @@ emit_cs_shader(struct anv_batch *batch,
          shader->kernel.offset +
          brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
 
-      /* Wa_1606682166 */
-      .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(shader),
-
-      /* We add 1 because the CS indirect parameters buffer isn't accounted
-       * for in bind_map.surface_count.
-       *
-       * Typically set to 0 to avoid prefetching on every thread dispatch.
-       */
-      .BindingTableEntryCount = devinfo->verx10 == 125 ?
-         0 : MIN2(shader->bind_map.surface_count, 30),
+      .SamplerCount           = get_sampler_count(device, shader),
+      .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31),
       .BarrierEnable          = cs_prog_data->uses_barrier,
       .SharedLocalMemorySize  =
          intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),
diff --git a/src/util/driconf.h b/src/util/driconf.h
index 4ed27b66443..055dbaa8223 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -353,6 +353,14 @@
 #define DRI_CONFIG_INTEL_TBIMR(def) \
    DRI_CONF_OPT_B(intel_tbimr, def, "Enable TBIMR tiled rendering")
 
+#define DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(def) \
+   DRI_CONF_OPT_B(intel_force_compute_surface_prefetch, def, \
+                  "Enable binding table surface prefetching for compute shaders")
+
+#define DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(def) \
+   DRI_CONF_OPT_B(intel_force_sampler_prefetch, def, \
+                  "Enable binding table sampler prefetching")
+
 #define DRI_CONFIG_INTEL_VF_DISTRIBUTION(def) \
    DRI_CONF_OPT_B(intel_vf_distribution, def, "Enable geometry distribution")