anv/iris: add drirc to enable sampler state & compute surface state prefetch

I noticed we disable the prefetch only on Gfx12.5. But surely that
recommendation carries over to later platforms.

It seems other drivers just disable it all the time and only have an
option to force the prefetch, so implement the same thing here.

Blorp path is left untouched.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39424>
This commit is contained in:
Lionel Landwerlin 2026-01-20 17:42:27 +02:00 committed by Marge Bot
parent 2f0d18f6af
commit a05fc97bc9
9 changed files with 114 additions and 70 deletions

View file

@ -14,6 +14,8 @@ DRI_CONF_SECTION_END
DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_ADAPTIVE_SYNC(true)
DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
DRI_CONFIG_INTEL_TBIMR(true)
DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)

View file

@ -163,8 +163,6 @@ iris_upload_shader(struct iris_screen *screen,
const void *key,
const void *assembly)
{
const struct intel_device_info *devinfo = screen->devinfo;
u_upload_alloc_ref(uploader, 0, shader->program_size, 64,
&shader->assembly.offset, &shader->assembly.res,
&shader->map);
@ -200,7 +198,7 @@ iris_upload_shader(struct iris_screen *screen,
}
/* Store the 3DSTATE shader packets and other derived state. */
screen->vtbl.store_derived_program_state(devinfo, cache_id, shader);
screen->vtbl.store_derived_program_state(screen, cache_id, shader);
util_queue_fence_signal(&shader->ready);

View file

@ -745,6 +745,10 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
driQueryOptionb(config->options, "intel_enable_wa_14018912822");
screen->driconf.intel_enable_wa_14024015672_msaa =
driQueryOptionb(config->options, "intel_enable_wa_14024015672_msaa");
screen->driconf.force_sampler_prefetch =
driQueryOptionb(config->options, "intel_force_sampler_prefetch");
screen->driconf.force_compute_surface_prefetch =
driQueryOptionb(config->options, "intel_force_compute_surface_prefetch");
screen->driconf.enable_tbimr =
driQueryOptionb(config->options, "intel_tbimr");
screen->driconf.enable_vf_distribution =

View file

@ -132,7 +132,7 @@ struct iris_vtable {
uint32_t offset);
unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
void (*store_derived_program_state)(const struct intel_device_info *devinfo,
void (*store_derived_program_state)(const struct iris_screen *screen,
enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader);
uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@ -209,6 +209,8 @@ struct iris_screen {
bool enable_te_distribution;
unsigned generated_indirect_threshold;
bool disable_threaded_context;
bool force_sampler_prefetch;
bool force_compute_surface_prefetch;
} driconf;
/** Does the kernel support various features (KERNEL_HAS_* bitfield)? */

View file

@ -5111,20 +5111,40 @@ iris_populate_cs_key(const struct iris_context *ice,
}
static inline uint32_t
encode_sampler_count(const struct iris_compiled_shader *shader)
encode_sampler_count(const struct iris_screen *screen,
const struct iris_compiled_shader *shader)
{
#if GFX_VER == 11
/* Wa_1606682166 */
return 0;
#else
if (!screen->driconf.force_sampler_prefetch)
return 0;
/* We can potentially have way more than 32 samplers and that's ok.
* However, the 3DSTATE_XS packets only have 3 bits to specify how
* many to pre-fetch and all values above 4 are marked reserved.
*/
uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
#endif
}
/**
 * Compute the value programmed into the BindingTableEntryCount fields.
 *
 * The result is the binding table size in bytes divided by 4.  On
 * GFX_VERx10 >= 125 builds, compute shaders report 0 instead unless the
 * force_compute_surface_prefetch driconf flag is set on the screen.
 */
static inline uint32_t
encode_surface_count(const struct iris_screen *screen,
                     const struct iris_compiled_shader *shader)
{
   const uint32_t entry_count = shader->bt.size_bytes / 4;

#if GFX_VERx10 >= 125
   /* Only compute shaders are gated by the driconf override. */
   const bool is_compute = shader->stage == MESA_SHADER_COMPUTE;
   if (is_compute && !screen->driconf.force_compute_surface_prefetch)
      return 0;
#endif

   return entry_count;
}
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
pkt.KernelStartPointer = KSP(shader); \
pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
pkt.SamplerCount = encode_sampler_count(shader); \
pkt.BindingTableEntryCount = encode_surface_count(screen, shader); \
pkt.SamplerCount = encode_sampler_count(screen, shader); \
pkt.FloatingPointMode = shader->use_alt_mode; \
\
pkt.DispatchGRFStartRegisterForURBData = \
@ -5180,9 +5200,10 @@ encode_sampler_count(const struct iris_compiled_shader *shader)
* Encode most of 3DSTATE_VS based on the compiled shader.
*/
static void
iris_store_vs_state(const struct intel_device_info *devinfo,
iris_store_vs_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
const struct intel_device_info *devinfo = screen->devinfo;
struct iris_vue_data *vue_data = iris_vue_data(shader);
iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
@ -5203,9 +5224,10 @@ iris_store_vs_state(const struct intel_device_info *devinfo,
* Encode most of 3DSTATE_HS based on the compiled shader.
*/
static void
iris_store_tcs_state(const struct intel_device_info *devinfo,
iris_store_tcs_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
const struct intel_device_info *devinfo = screen->devinfo;
struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
struct iris_vue_data *vue_data = &tcs_data->base;
@ -5252,9 +5274,10 @@ iris_store_tcs_state(const struct intel_device_info *devinfo,
* Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
*/
static void
iris_store_tes_state(const struct intel_device_info *devinfo,
iris_store_tes_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
const struct intel_device_info *devinfo = screen->devinfo;
struct iris_tes_data *tes_data = iris_tes_data(shader);
struct iris_vue_data *vue_data = &tes_data->base;
@ -5319,9 +5342,10 @@ iris_store_tes_state(const struct intel_device_info *devinfo,
* Encode most of 3DSTATE_GS based on the compiled shader.
*/
static void
iris_store_gs_state(const struct intel_device_info *devinfo,
iris_store_gs_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
const struct intel_device_info *devinfo = screen->devinfo;
struct iris_gs_data *gs_data = iris_gs_data(shader);
struct iris_vue_data *vue_data = &gs_data->base;
@ -5367,9 +5391,10 @@ iris_store_gs_state(const struct intel_device_info *devinfo,
* Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
*/
static void
iris_store_fs_state(const struct intel_device_info *devinfo,
iris_store_fs_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
const struct intel_device_info *devinfo = screen->devinfo;
struct iris_fs_data *fs_data = iris_fs_data(shader);
uint32_t *ps_state = (void *) shader->derived_data;
@ -5377,8 +5402,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
ps.VectorMaskEnable = fs_data->uses_vmask;
ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
ps.SamplerCount = encode_sampler_count(shader);
ps.BindingTableEntryCount = encode_surface_count(screen, shader);
ps.SamplerCount = encode_sampler_count(screen, shader);
ps.FloatingPointMode = shader->use_alt_mode;
ps.MaximumNumberofThreadsPerPSD =
devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
@ -5453,7 +5478,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
* This must match the data written by the iris_store_xs_state() functions.
*/
static void
iris_store_cs_state(const struct intel_device_info *devinfo,
iris_store_cs_state(const struct iris_screen *screen,
struct iris_compiled_shader *shader)
{
struct iris_cs_data *cs_data = iris_cs_data(shader);
@ -5471,10 +5496,8 @@ iris_store_cs_state(const struct intel_device_info *devinfo,
#if GFX_VERx10 <= 125
desc.BarrierEnable = cs_data->uses_barrier;
#endif
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bt.size_bytes / 4, 31);
desc.SamplerCount = encode_sampler_count(shader);
desc.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
desc.SamplerCount = encode_sampler_count(screen, shader);
/* TODO: Check if we are missing workarounds and enable mid-thread
* preemption.
*
@ -5522,28 +5545,28 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
* get most of the state packet without having to reconstruct it.
*/
static void
iris_store_derived_program_state(const struct intel_device_info *devinfo,
iris_store_derived_program_state(const struct iris_screen *screen,
enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader)
{
switch (cache_id) {
case IRIS_CACHE_VS:
iris_store_vs_state(devinfo, shader);
iris_store_vs_state(screen, shader);
break;
case IRIS_CACHE_TCS:
iris_store_tcs_state(devinfo, shader);
iris_store_tcs_state(screen, shader);
break;
case IRIS_CACHE_TES:
iris_store_tes_state(devinfo, shader);
iris_store_tes_state(screen, shader);
break;
case IRIS_CACHE_GS:
iris_store_gs_state(devinfo, shader);
iris_store_gs_state(screen, shader);
break;
case IRIS_CACHE_FS:
iris_store_fs_state(devinfo, shader);
iris_store_fs_state(screen, shader);
break;
case IRIS_CACHE_CS:
iris_store_cs_state(devinfo, shader);
iris_store_cs_state(screen, shader);
break;
case IRIS_CACHE_BLORP:
break;
@ -9248,11 +9271,9 @@ iris_upload_compute_walker(struct iris_context *ice,
dispatch.group_size,
dispatch.simd_size);
idd.SamplerStatePointer = shs->sampler_table.offset;
idd.SamplerCount = encode_sampler_count(shader),
idd.SamplerCount = encode_sampler_count(screen, shader),
idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bt.size_bytes / 4, 31);
idd.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31);
idd.NumberOfBarriers = cs_data->uses_barrier;
#if GFX_VER >= 30
idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);

View file

@ -36,6 +36,8 @@ static const driOptionDescription anv_dri_options[] = {
DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
DRI_CONF_ANV_DISABLE_LINK_TIME_OPTIMIZATION(false)
DRI_CONF_SHADER_SPILLING_RATE(11)
DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false)
DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false)
DRI_CONFIG_INTEL_TBIMR(true)
DRI_CONFIG_INTEL_VF_DISTRIBUTION(true)
DRI_CONFIG_INTEL_TE_DISTRIBUTION(true)
@ -200,6 +202,10 @@ anv_init_dri_options(struct anv_instance *instance)
driQueryOptioni(&instance->dri_options, "force_vk_vendor");
instance->has_fake_sparse =
driQueryOptionb(&instance->dri_options, "fake_sparse");
instance->force_sampler_prefetch =
driQueryOptionb(&instance->dri_options, "intel_force_sampler_prefetch");
instance->force_compute_surface_prefetch =
driQueryOptionb(&instance->dri_options, "intel_force_compute_surface_prefetch");
instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
instance->enable_vf_distribution =
driQueryOptionb(&instance->dri_options, "intel_vf_distribution");

View file

@ -1782,6 +1782,8 @@ struct anv_instance {
bool custom_border_colors_without_format;
bool vf_component_packing;
bool large_workgroup_non_coherent_image_workaround;
bool force_sampler_prefetch;
bool force_compute_surface_prefetch;
/* HW workarounds */
bool no_16bit;

View file

@ -27,15 +27,36 @@
}))
static uint32_t
get_sampler_count(const struct anv_shader *shader)
get_surface_count(const struct anv_device *device,
const struct anv_shader *shader)
{
uint32_t count_by_4 = DIV_ROUND_UP(shader->bind_map.sampler_count, 4);
#if GFX_VERx10 >= 125
if (shader->vk.stage == MESA_SHADER_COMPUTE &&
!device->physical->instance->force_compute_surface_prefetch)
return 0;
#endif
return shader->bind_map.surface_count;
}
/* We can potentially have way more than 32 samplers and that's ok.
* However, the 3DSTATE_XS packets only have 3 bits to specify how
* many to pre-fetch and all values above 4 are marked reserved.
static uint32_t
get_sampler_count(const struct anv_device *device,
const struct anv_shader *shader)
{
#if GFX_VER == 11
/* Wa_1606682166:
*
* Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. Disable
* the Sampler state prefetch functionality in the SARB by programming
* 0xB000[30] to '1'.
*/
return MIN2(count_by_4, 4);
return 0;
#else
if (!device->physical->instance->force_sampler_prefetch)
return 0;
return DIV_ROUND_UP(
CLAMP(shader->bind_map.sampler_count, 0, 16), 4);
#endif
}
static UNUSED struct anv_address
@ -557,13 +578,8 @@ emit_vs_shader(struct anv_batch *batch,
vs.SingleVertexDispatch = false;
#endif
vs.VectorMaskEnable = false;
/* Wa_1606682166:
* Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
* Disable the Sampler state prefetch functionality in the SARB by
* programming 0xB000[30] to '1'.
*/
vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
vs.BindingTableEntryCount = shader->bind_map.surface_count;
vs.SamplerCount = get_sampler_count(device, shader);
vs.BindingTableEntryCount = get_surface_count(device, shader);
vs.FloatingPointMode = IEEE754;
vs.IllegalOpcodeExceptionEnable = false;
vs.SoftwareExceptionEnable = false;
@ -619,9 +635,8 @@ emit_hs_shader(struct anv_batch *batch,
hs.Enable = true;
hs.StatisticsEnable = true;
hs.KernelStartPointer = shader->kernel.offset;
/* Wa_1606682166 */
hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
hs.BindingTableEntryCount = shader->bind_map.surface_count;
hs.SamplerCount = get_sampler_count(device, shader);
hs.BindingTableEntryCount = get_surface_count(device, shader);
#if GFX_VER >= 12
/* Wa_1604578095:
@ -724,9 +739,8 @@ emit_ds_shader(struct anv_batch *batch,
ds.Enable = true;
ds.StatisticsEnable = true;
ds.KernelStartPointer = shader->kernel.offset;
/* Wa_1606682166 */
ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
ds.BindingTableEntryCount = shader->bind_map.surface_count;
ds.SamplerCount = get_sampler_count(device, shader);
ds.BindingTableEntryCount = get_surface_count(device, shader);
ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
@ -799,9 +813,8 @@ emit_gs_shader(struct anv_batch *batch,
gs.SingleProgramFlow = false;
gs.VectorMaskEnable = false;
/* Wa_1606682166 */
gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
gs.BindingTableEntryCount = shader->bind_map.surface_count;
gs.SamplerCount = get_sampler_count(device, shader);
gs.BindingTableEntryCount = get_surface_count(device, shader);
gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
@ -1060,9 +1073,8 @@ emit_ps_shader(struct anv_batch *batch,
ps.SingleProgramFlow = false;
ps.VectorMaskEnable = wm_prog_data->uses_vmask;
/* Wa_1606682166 */
ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
ps.BindingTableEntryCount = shader->bind_map.surface_count;
ps.SamplerCount = get_sampler_count(device, shader);
ps.BindingTableEntryCount = get_surface_count(device, shader);
#if GFX_VER < 20
ps.PushConstantEnable = wm_prog_data->base.push_sizes[0] > 0;
#endif
@ -1177,11 +1189,8 @@ emit_cs_shader(struct anv_batch *batch,
},
.InterfaceDescriptor = {
.KernelStartPointer = shader->kernel.offset,
.SamplerCount = DIV_ROUND_UP(
CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(shader->bind_map.surface_count, 30),
.SamplerCount = get_sampler_count(device, shader),
.BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31),
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = intel_compute_slm_encode_size(
GFX_VER, cs_prog_data->base.total_shared),
@ -1231,16 +1240,8 @@ emit_cs_shader(struct anv_batch *batch,
shader->kernel.offset +
brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
/* Wa_1606682166 */
.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader),
/* We add 1 because the CS indirect parameters buffer isn't accounted
* for in bind_map.surface_count.
*
* Typically set to 0 to avoid prefetching on every thread dispatch.
*/
.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bind_map.surface_count, 30),
.SamplerCount = get_sampler_count(device, shader),
.BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31),
.BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize =
intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),

View file

@ -353,6 +353,14 @@
#define DRI_CONFIG_INTEL_TBIMR(def) \
DRI_CONF_OPT_B(intel_tbimr, def, "Enable TBIMR tiled rendering")
/* driconf option "intel_force_compute_surface_prefetch"; `def` is the default
 * value supplied by each driver's option table.
 */
#define DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(def) \
   DRI_CONF_OPT_B(intel_force_compute_surface_prefetch, def, \
                  "Enable binding table surface prefetching for compute shaders")
/* driconf option "intel_force_sampler_prefetch"; `def` is the default value
 * supplied by each driver's option table.
 */
#define DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(def) \
   DRI_CONF_OPT_B(intel_force_sampler_prefetch, def, \
                  "Enable binding table sampler prefetching")
#define DRI_CONFIG_INTEL_VF_DISTRIBUTION(def) \
DRI_CONF_OPT_B(intel_vf_distribution, def, "Enable geometry distribution")