anv/iris: add drirc to enable sampler state & compute surface state prefetch

I noticed we disable the prefetch only on Gfx12.5, but surely that recommendation carries over to later platforms. It seems other drivers just disable it all the time and only have an option to force the prefetch, so implement the same thing here. The Blorp path is left untouched. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Tapani Pälli <tapani.palli@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39424>
2026-06-14 13:38:20 +02:00 · 2026-01-20 17:42:27 +02:00 · 2026-01-20 17:42:27 +02:00 · a05fc97bc9
commit a05fc97bc9
parent 2f0d18f6af
9 changed files with 114 additions and 70 deletions
--- a/src/gallium/drivers/iris/driinfo_iris.h
+++ b/src/gallium/drivers/iris/driinfo_iris.h
@ -14,6 +14,8 @@ DRI_CONF_SECTION_END

 DRI_CONF_SECTION_PERFORMANCE
   DRI_CONF_ADAPTIVE_SYNC(true)
+   DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
+   DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
   DRI_CONFIG_INTEL_TBIMR(true)
   DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
   DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)
--- a/src/gallium/drivers/iris/iris_program_cache.c
+++ b/src/gallium/drivers/iris/iris_program_cache.c
@ -163,8 +163,6 @@ iris_upload_shader(struct iris_screen *screen,
                   const void *key,
                   const void *assembly)
 {
-   const struct intel_device_info *devinfo = screen->devinfo;
-
   u_upload_alloc_ref(uploader, 0, shader->program_size, 64,
                  &shader->assembly.offset, &shader->assembly.res,
                  &shader->map);
@ -200,7 +198,7 @@ iris_upload_shader(struct iris_screen *screen,
   }

   /* Store the 3DSTATE shader packets and other derived state. */
-   screen->vtbl.store_derived_program_state(devinfo, cache_id, shader);
+   screen->vtbl.store_derived_program_state(screen, cache_id, shader);

   util_queue_fence_signal(&shader->ready);

--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@ -745,6 +745,10 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
      driQueryOptionb(config->options, "intel_enable_wa_14018912822");
   screen->driconf.intel_enable_wa_14024015672_msaa =
      driQueryOptionb(config->options, "intel_enable_wa_14024015672_msaa");
+   screen->driconf.force_sampler_prefetch =
+      driQueryOptionb(config->options, "intel_force_sampler_prefetch");
+   screen->driconf.force_compute_surface_prefetch =
+      driQueryOptionb(config->options, "intel_force_compute_surface_prefetch");
   screen->driconf.enable_tbimr =
      driQueryOptionb(config->options, "intel_tbimr");
   screen->driconf.enable_vf_distribution =
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@ -132,7 +132,7 @@ struct iris_vtable {
                                     uint32_t offset);

   unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
-   void (*store_derived_program_state)(const struct intel_device_info *devinfo,
+   void (*store_derived_program_state)(const struct iris_screen *screen,
                                       enum iris_program_cache_id cache_id,
                                       struct iris_compiled_shader *shader);
   uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@ -209,6 +209,8 @@ struct iris_screen {
      bool enable_te_distribution;
      unsigned generated_indirect_threshold;
      bool disable_threaded_context;
+      bool force_sampler_prefetch;
+      bool force_compute_surface_prefetch;
   } driconf;

   /** Does the kernel support various features (KERNEL_HAS_* bitfield)? */
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@ -5111,20 +5111,40 @@ iris_populate_cs_key(const struct iris_context *ice,
 }

 static inline uint32_t
-encode_sampler_count(const struct iris_compiled_shader *shader)
+encode_sampler_count(const struct iris_screen *screen,
+                     const struct iris_compiled_shader *shader)
 {
+#if GFX_VER == 11
+   /* Wa_1606682166 */
+   return 0;
+#else
+   if (!screen->driconf.force_sampler_prefetch)
+      return 0;
   /* We can potentially have way more than 32 samplers and that's ok.
    * However, the 3DSTATE_XS packets only have 3 bits to specify how
    * many to pre-fetch and all values above 4 are marked reserved.
    */
   uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
   return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
+#endif
+}
+
+static inline uint32_t
+encode_surface_count(const struct iris_screen *screen,
+                     const struct iris_compiled_shader *shader)
+{
+#if GFX_VERx10 >= 125
+   if (shader->stage == MESA_SHADER_COMPUTE &&
+       !screen->driconf.force_compute_surface_prefetch)
+      return 0;
+#endif
+   return shader->bt.size_bytes / 4;
 }

 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
   pkt.KernelStartPointer = KSP(shader);                                  \
-   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
-   pkt.SamplerCount = encode_sampler_count(shader);                       \
+   pkt.BindingTableEntryCount = encode_surface_count(screen, shader);     \
+   pkt.SamplerCount = encode_sampler_count(screen, shader);               \
   pkt.FloatingPointMode = shader->use_alt_mode;                          \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
@ -5180,9 +5200,10 @@ encode_sampler_count(const struct iris_compiled_shader *shader)
 * Encode most of 3DSTATE_VS based on the compiled shader.
 */
 static void
-iris_store_vs_state(const struct intel_device_info *devinfo,
+iris_store_vs_state(const struct iris_screen *screen,
                    struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
   struct iris_vue_data *vue_data = iris_vue_data(shader);

   iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
@ -5203,9 +5224,10 @@ iris_store_vs_state(const struct intel_device_info *devinfo,
 * Encode most of 3DSTATE_HS based on the compiled shader.
 */
 static void
-iris_store_tcs_state(const struct intel_device_info *devinfo,
+iris_store_tcs_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
   struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
   struct iris_vue_data *vue_data = &tcs_data->base;

@ -5252,9 +5274,10 @@ iris_store_tcs_state(const struct intel_device_info *devinfo,
 * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
 */
 static void
-iris_store_tes_state(const struct intel_device_info *devinfo,
+iris_store_tes_state(const struct iris_screen *screen,
                     struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
   struct iris_tes_data *tes_data = iris_tes_data(shader);
   struct iris_vue_data *vue_data = &tes_data->base;

@ -5319,9 +5342,10 @@ iris_store_tes_state(const struct intel_device_info *devinfo,
 * Encode most of 3DSTATE_GS based on the compiled shader.
 */
 static void
-iris_store_gs_state(const struct intel_device_info *devinfo,
+iris_store_gs_state(const struct iris_screen *screen,
                    struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
   struct iris_gs_data *gs_data = iris_gs_data(shader);
   struct iris_vue_data *vue_data = &gs_data->base;

@ -5367,9 +5391,10 @@ iris_store_gs_state(const struct intel_device_info *devinfo,
 * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
 */
 static void
-iris_store_fs_state(const struct intel_device_info *devinfo,
+iris_store_fs_state(const struct iris_screen *screen,
                    struct iris_compiled_shader *shader)
 {
+   const struct intel_device_info *devinfo = screen->devinfo;
   struct iris_fs_data *fs_data = iris_fs_data(shader);

   uint32_t *ps_state = (void *) shader->derived_data;
@ -5377,8 +5402,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo,

   iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
      ps.VectorMaskEnable = fs_data->uses_vmask;
-      ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
-      ps.SamplerCount = encode_sampler_count(shader);
+      ps.BindingTableEntryCount = encode_surface_count(screen, shader);
+      ps.SamplerCount = encode_sampler_count(screen, shader);
      ps.FloatingPointMode = shader->use_alt_mode;
      ps.MaximumNumberofThreadsPerPSD =
         devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
@ -5453,7 +5478,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
 * This must match the data written by the iris_store_xs_state() functions.
 */
 static void
-iris_store_cs_state(const struct intel_device_info *devinfo,
+iris_store_cs_state(const struct iris_screen *screen,
                    struct iris_compiled_shader *shader)
 {
   struct iris_cs_data *cs_data = iris_cs_data(shader);
@ -5471,10 +5496,8 @@ iris_store_cs_state(const struct intel_device_info *devinfo,
 #if GFX_VERx10 <= 125
      desc.BarrierEnable = cs_data->uses_barrier;
 #endif
-      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-      desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
-         0 : MIN2(shader->bt.size_bytes / 4, 31);
-      desc.SamplerCount = encode_sampler_count(shader);
+      desc.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
+      desc.SamplerCount = encode_sampler_count(screen, shader);
      /* TODO: Check if we are missing workarounds and enable mid-thread
       * preemption.
       *
@ -5522,28 +5545,28 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
 * get most of the state packet without having to reconstruct it.
 */
 static void
-iris_store_derived_program_state(const struct intel_device_info *devinfo,
+iris_store_derived_program_state(const struct iris_screen *screen,
                                 enum iris_program_cache_id cache_id,
                                 struct iris_compiled_shader *shader)
 {
   switch (cache_id) {
   case IRIS_CACHE_VS:
-      iris_store_vs_state(devinfo, shader);
+      iris_store_vs_state(screen, shader);
      break;
   case IRIS_CACHE_TCS:
-      iris_store_tcs_state(devinfo, shader);
+      iris_store_tcs_state(screen, shader);
      break;
   case IRIS_CACHE_TES:
-      iris_store_tes_state(devinfo, shader);
+      iris_store_tes_state(screen, shader);
      break;
   case IRIS_CACHE_GS:
-      iris_store_gs_state(devinfo, shader);
+      iris_store_gs_state(screen, shader);
      break;
   case IRIS_CACHE_FS:
-      iris_store_fs_state(devinfo, shader);
+      iris_store_fs_state(screen, shader);
      break;
   case IRIS_CACHE_CS:
-      iris_store_cs_state(devinfo, shader);
+      iris_store_cs_state(screen, shader);
      break;
   case IRIS_CACHE_BLORP:
      break;
@ -9248,11 +9271,9 @@ iris_upload_compute_walker(struct iris_context *ice,
                                                   dispatch.group_size,
                                                   dispatch.simd_size);
   idd.SamplerStatePointer = shs->sampler_table.offset;
-   idd.SamplerCount = encode_sampler_count(shader),
+   idd.SamplerCount = encode_sampler_count(screen, shader),
   idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
-   /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-   idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
-      0 : MIN2(shader->bt.size_bytes / 4, 31);
+   idd.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
   idd.NumberOfBarriers = cs_data->uses_barrier;
 #if GFX_VER >= 30
   idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
--- a/src/intel/vulkan/anv_instance.c
+++ b/src/intel/vulkan/anv_instance.c
@ -36,6 +36,8 @@ static const driOptionDescription anv_dri_options[] = {
      DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
      DRI_CONF_ANV_DISABLE_LINK_TIME_OPTIMIZATION(false)
      DRI_CONF_SHADER_SPILLING_RATE(11)
+      DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
+      DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
      DRI_CONFIG_INTEL_TBIMR(true)
      DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
      DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)
@ -200,6 +202,10 @@ anv_init_dri_options(struct anv_instance *instance)
       driQueryOptioni(&instance->dri_options, "force_vk_vendor");
    instance->has_fake_sparse =
       driQueryOptionb(&instance->dri_options, "fake_sparse");
+    instance->force_sampler_prefetch =
+       driQueryOptionb(&instance->dri_options, "intel_force_sampler_prefetch");
+    instance->force_compute_surface_prefetch =
+       driQueryOptionb(&instance->dri_options, "intel_force_compute_surface_prefetch");
    instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
    instance->enable_vf_distribution =
       driQueryOptionb(&instance->dri_options, "intel_vf_distribution");
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@ -1782,6 +1782,8 @@ struct anv_instance {
    bool                                        custom_border_colors_without_format;
    bool                                        vf_component_packing;
    bool                                        large_workgroup_non_coherent_image_workaround;
+    bool                                        force_sampler_prefetch;
+    bool                                        force_compute_surface_prefetch;

    /* HW workarounds */
    bool                                        no_16bit;
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@ -27,15 +27,36 @@
        }))

 static uint32_t
-get_sampler_count(const struct anv_shader *shader)
+get_surface_count(const struct anv_device *device,
+                  const struct anv_shader *shader)
 {
-   uint32_t count_by_4 = DIV_ROUND_UP(shader->bind_map.sampler_count, 4);
+#if GFX_VERx10 >= 125
+   if (shader->vk.stage == MESA_SHADER_COMPUTE &&
+       !device->physical->instance->force_compute_surface_prefetch)
+      return 0;
+#endif
+   return shader->bind_map.surface_count;
+}

-   /* We can potentially have way more than 32 samplers and that's ok.
-    * However, the 3DSTATE_XS packets only have 3 bits to specify how
-    * many to pre-fetch and all values above 4 are marked reserved.
+static uint32_t
+get_sampler_count(const struct anv_device *device,
+                  const struct anv_shader *shader)
+{
+#if GFX_VER == 11
+   /* Wa_1606682166:
+    *
+    * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. Disable
+    * the Sampler state prefetch functionality in the SARB by programming
+    * 0xB000[30] to '1'.
    */
-   return MIN2(count_by_4, 4);
+   return 0;
+#else
+   if (!device->physical->instance->force_sampler_prefetch)
+      return 0;
+
+   return DIV_ROUND_UP(
+      CLAMP(shader->bind_map.sampler_count, 0, 16), 4);
+#endif
 }

 static UNUSED struct anv_address
@ -557,13 +578,8 @@ emit_vs_shader(struct anv_batch *batch,
      vs.SingleVertexDispatch       = false;
 #endif
      vs.VectorMaskEnable           = false;
-      /* Wa_1606682166:
-       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
-       * Disable the Sampler state prefetch functionality in the SARB by
-       * programming 0xB000[30] to '1'.
-       */
-      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      vs.BindingTableEntryCount     = shader->bind_map.surface_count;
+      vs.SamplerCount               = get_sampler_count(device, shader);
+      vs.BindingTableEntryCount     = get_surface_count(device, shader);
      vs.FloatingPointMode          = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable    = false;
@ -619,9 +635,8 @@ emit_hs_shader(struct anv_batch *batch,
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = shader->kernel.offset;
-      /* Wa_1606682166 */
-      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      hs.BindingTableEntryCount = shader->bind_map.surface_count;
+      hs.SamplerCount = get_sampler_count(device, shader);
+      hs.BindingTableEntryCount = get_surface_count(device, shader);

 #if GFX_VER >= 12
      /* Wa_1604578095:
@ -724,9 +739,8 @@ emit_ds_shader(struct anv_batch *batch,
      ds.Enable = true;
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = shader->kernel.offset;
-      /* Wa_1606682166 */
-      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      ds.BindingTableEntryCount = shader->bind_map.surface_count;
+      ds.SamplerCount = get_sampler_count(device, shader);
+      ds.BindingTableEntryCount = get_surface_count(device, shader);
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
@ -799,9 +813,8 @@ emit_gs_shader(struct anv_batch *batch,

      gs.SingleProgramFlow       = false;
      gs.VectorMaskEnable        = false;
-      /* Wa_1606682166 */
-      gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      gs.BindingTableEntryCount  = shader->bind_map.surface_count;
+      gs.SamplerCount            = get_sampler_count(device, shader);
+      gs.BindingTableEntryCount  = get_surface_count(device, shader);
      gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
      gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;

@ -1060,9 +1073,8 @@ emit_ps_shader(struct anv_batch *batch,

      ps.SingleProgramFlow          = false;
      ps.VectorMaskEnable           = wm_prog_data->uses_vmask;
-      /* Wa_1606682166 */
-      ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(shader);
-      ps.BindingTableEntryCount     = shader->bind_map.surface_count;
+      ps.SamplerCount               = get_sampler_count(device, shader);
+      ps.BindingTableEntryCount     = get_surface_count(device, shader);
 #if GFX_VER < 20
      ps.PushConstantEnable         = wm_prog_data->base.push_sizes[0] > 0;
 #endif
@ -1177,11 +1189,8 @@ emit_cs_shader(struct anv_batch *batch,
      },
      .InterfaceDescriptor            = {
         .KernelStartPointer                = shader->kernel.offset,
-         .SamplerCount                      = DIV_ROUND_UP(
-            CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
-         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-         .BindingTableEntryCount            = devinfo->verx10 == 125 ?
-                                              0 : 1 + MIN2(shader->bind_map.surface_count, 30),
+         .SamplerCount                      = get_sampler_count(device, shader),
+         .BindingTableEntryCount            = MIN2(get_surface_count(device, shader), 31),
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize             = intel_compute_slm_encode_size(
            GFX_VER, cs_prog_data->base.total_shared),
@ -1231,16 +1240,8 @@ emit_cs_shader(struct anv_batch *batch,
         shader->kernel.offset +
         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),

-      /* Wa_1606682166 */
-      .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(shader),
-
-      /* We add 1 because the CS indirect parameters buffer isn't accounted
-       * for in bind_map.surface_count.
-       *
-       * Typically set to 0 to avoid prefetching on every thread dispatch.
-       */
-      .BindingTableEntryCount = devinfo->verx10 == 125 ?
-         0 : MIN2(shader->bind_map.surface_count, 30),
+      .SamplerCount           = get_sampler_count(device, shader),
+      .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31),
      .BarrierEnable          = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize  =
         intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@ -353,6 +353,14 @@
 #define DRI_CONFIG_INTEL_TBIMR(def) \
   DRI_CONF_OPT_B(intel_tbimr, def, "Enable TBIMR tiled rendering")

+#define DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(def) \
+   DRI_CONF_OPT_B(intel_force_compute_surface_prefetch, def, \
+                  "Enable binding table surface prefetching for compute shaders")
+
+#define DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(def) \
+   DRI_CONF_OPT_B(intel_force_sampler_prefetch, def, \
+                  "Enable sampler state prefetching")
+
 #define DRI_CONFIG_INTEL_VF_DISTRIBUTION(def) \
   DRI_CONF_OPT_B(intel_vf_distribution, def, "Enable geometry distribution")