From a05fc97bc958bb9d1d485520c6989b98cb775dde Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 20 Jan 2026 17:42:27 +0200 Subject: [PATCH] anv/iris: add drirc to enable sampler state & compute surface state prefetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed we disable the prefetch only on Gfx12.5. But surely that recommendation carries over to later platforms. It seems other drivers just disable it all the time and only have an option to force the prefetch. So implement the same thing here. Blorp path is left untouched. Signed-off-by: Lionel Landwerlin Reviewed-by: Tapani Pälli Part-of: --- src/gallium/drivers/iris/driinfo_iris.h | 2 + src/gallium/drivers/iris/iris_program_cache.c | 4 +- src/gallium/drivers/iris/iris_screen.c | 4 + src/gallium/drivers/iris/iris_screen.h | 4 +- src/gallium/drivers/iris/iris_state.c | 73 +++++++++++------ src/intel/vulkan/anv_instance.c | 6 ++ src/intel/vulkan/anv_private.h | 2 + src/intel/vulkan/genX_shader.c | 81 ++++++++++--------- src/util/driconf.h | 8 ++ 9 files changed, 114 insertions(+), 70 deletions(-) diff --git a/src/gallium/drivers/iris/driinfo_iris.h b/src/gallium/drivers/iris/driinfo_iris.h index d0fa3b6c1d9..9486da874fb 100644 --- a/src/gallium/drivers/iris/driinfo_iris.h +++ b/src/gallium/drivers/iris/driinfo_iris.h @@ -14,6 +14,8 @@ DRI_CONF_SECTION_END DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false) + DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false) DRI_CONFIG_INTEL_TBIMR(true) DRI_CONFIG_INTEL_VF_DISTRIBUTION(true) DRI_CONFIG_INTEL_TE_DISTRIBUTION(true) diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index 4e7cae5bb9a..dede919098f 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -163,8 +163,6 @@ iris_upload_shader(struct iris_screen *screen, const void *key, 
const void *assembly) { - const struct intel_device_info *devinfo = screen->devinfo; - u_upload_alloc_ref(uploader, 0, shader->program_size, 64, &shader->assembly.offset, &shader->assembly.res, &shader->map); @@ -200,7 +198,7 @@ iris_upload_shader(struct iris_screen *screen, } /* Store the 3DSTATE shader packets and other derived state. */ - screen->vtbl.store_derived_program_state(devinfo, cache_id, shader); + screen->vtbl.store_derived_program_state(screen, cache_id, shader); util_queue_fence_signal(&shader->ready); diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c index bb2e3bbf936..54374f019e8 100644 --- a/src/gallium/drivers/iris/iris_screen.c +++ b/src/gallium/drivers/iris/iris_screen.c @@ -745,6 +745,10 @@ iris_screen_create(int fd, const struct pipe_screen_config *config) driQueryOptionb(config->options, "intel_enable_wa_14018912822"); screen->driconf.intel_enable_wa_14024015672_msaa = driQueryOptionb(config->options, "intel_enable_wa_14024015672_msaa"); + screen->driconf.force_sampler_prefetch = + driQueryOptionb(config->options, "intel_force_sampler_prefetch"); + screen->driconf.force_compute_surface_prefetch = + driQueryOptionb(config->options, "intel_force_compute_surface_prefetch"); screen->driconf.enable_tbimr = driQueryOptionb(config->options, "intel_tbimr"); screen->driconf.enable_vf_distribution = diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h index 0e6a7f540b5..902c0c95cd1 100644 --- a/src/gallium/drivers/iris/iris_screen.h +++ b/src/gallium/drivers/iris/iris_screen.h @@ -132,7 +132,7 @@ struct iris_vtable { uint32_t offset); unsigned (*derived_program_state_size)(enum iris_program_cache_id id); - void (*store_derived_program_state)(const struct intel_device_info *devinfo, + void (*store_derived_program_state)(const struct iris_screen *screen, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader); uint32_t *(*create_so_decl_list)(const 
struct pipe_stream_output_info *sol, @@ -209,6 +209,8 @@ struct iris_screen { bool enable_te_distribution; unsigned generated_indirect_threshold; bool disable_threaded_context; + bool force_sampler_prefetch; + bool force_compute_surface_prefetch; } driconf; /** Does the kernel support various features (KERNEL_HAS_* bitfield)? */ diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 0de2cd5d6f3..bfccf68850d 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -5111,20 +5111,40 @@ iris_populate_cs_key(const struct iris_context *ice, } static inline uint32_t -encode_sampler_count(const struct iris_compiled_shader *shader) +encode_sampler_count(const struct iris_screen *screen, + const struct iris_compiled_shader *shader) { +#if GFX_VER == 11 + /* Wa_1606682166 */ + return 0; +#else + if (!screen->driconf.force_sampler_prefetch) + return 0; /* We can potentially have way more than 32 samplers and that's ok. * However, the 3DSTATE_XS packets only have 3 bits to specify how * many to pre-fetch and all values above 4 are marked reserved. 
*/ uint32_t count = util_last_bit64(shader->bt.samplers_used_mask); return DIV_ROUND_UP(CLAMP(count, 0, 16), 4); +#endif +} + +static inline uint32_t +encode_surface_count(const struct iris_screen *screen, + const struct iris_compiled_shader *shader) +{ +#if GFX_VERx10 >= 125 + if (shader->stage == MESA_SHADER_COMPUTE && + !screen->driconf.force_compute_surface_prefetch) + return 0; +#endif + return shader->bt.size_bytes / 4; } #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ pkt.KernelStartPointer = KSP(shader); \ - pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ - pkt.SamplerCount = encode_sampler_count(shader); \ + pkt.BindingTableEntryCount = encode_surface_count(screen, shader); \ + pkt.SamplerCount = encode_sampler_count(screen, shader); \ pkt.FloatingPointMode = shader->use_alt_mode; \ \ pkt.DispatchGRFStartRegisterForURBData = \ @@ -5180,9 +5200,10 @@ encode_sampler_count(const struct iris_compiled_shader *shader) * Encode most of 3DSTATE_VS based on the compiled shader. */ static void -iris_store_vs_state(const struct intel_device_info *devinfo, +iris_store_vs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_vue_data *vue_data = iris_vue_data(shader); iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) { @@ -5203,9 +5224,10 @@ iris_store_vs_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_HS based on the compiled shader. 
*/ static void -iris_store_tcs_state(const struct intel_device_info *devinfo, +iris_store_tcs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_tcs_data *tcs_data = iris_tcs_data(shader); struct iris_vue_data *vue_data = &tcs_data->base; @@ -5252,9 +5274,10 @@ iris_store_tcs_state(const struct intel_device_info *devinfo, * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader. */ static void -iris_store_tes_state(const struct intel_device_info *devinfo, +iris_store_tes_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_tes_data *tes_data = iris_tes_data(shader); struct iris_vue_data *vue_data = &tes_data->base; @@ -5319,9 +5342,10 @@ iris_store_tes_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_GS based on the compiled shader. */ static void -iris_store_gs_state(const struct intel_device_info *devinfo, +iris_store_gs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_gs_data *gs_data = iris_gs_data(shader); struct iris_vue_data *vue_data = &gs_data->base; @@ -5367,9 +5391,10 @@ iris_store_gs_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader. 
*/ static void -iris_store_fs_state(const struct intel_device_info *devinfo, +iris_store_fs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_fs_data *fs_data = iris_fs_data(shader); uint32_t *ps_state = (void *) shader->derived_data; @@ -5377,8 +5402,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo, iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) { ps.VectorMaskEnable = fs_data->uses_vmask; - ps.BindingTableEntryCount = shader->bt.size_bytes / 4; - ps.SamplerCount = encode_sampler_count(shader); + ps.BindingTableEntryCount = encode_surface_count(screen, shader); + ps.SamplerCount = encode_sampler_count(screen, shader); ps.FloatingPointMode = shader->use_alt_mode; ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1); @@ -5453,7 +5478,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo, * This must match the data written by the iris_store_xs_state() functions. */ static void -iris_store_cs_state(const struct intel_device_info *devinfo, +iris_store_cs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { struct iris_cs_data *cs_data = iris_cs_data(shader); @@ -5471,10 +5496,8 @@ iris_store_cs_state(const struct intel_device_info *devinfo, #if GFX_VERx10 <= 125 desc.BarrierEnable = cs_data->uses_barrier; #endif - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - desc.BindingTableEntryCount = devinfo->verx10 == 125 ? - 0 : MIN2(shader->bt.size_bytes / 4, 31); - desc.SamplerCount = encode_sampler_count(shader); + desc.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31); + desc.SamplerCount = encode_sampler_count(screen, shader); /* TODO: Check if we are missing workarounds and enable mid-thread * preemption. 
* @@ -5522,28 +5545,28 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id) * get most of the state packet without having to reconstruct it. */ static void -iris_store_derived_program_state(const struct intel_device_info *devinfo, +iris_store_derived_program_state(const struct iris_screen *screen, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader) { switch (cache_id) { case IRIS_CACHE_VS: - iris_store_vs_state(devinfo, shader); + iris_store_vs_state(screen, shader); break; case IRIS_CACHE_TCS: - iris_store_tcs_state(devinfo, shader); + iris_store_tcs_state(screen, shader); break; case IRIS_CACHE_TES: - iris_store_tes_state(devinfo, shader); + iris_store_tes_state(screen, shader); break; case IRIS_CACHE_GS: - iris_store_gs_state(devinfo, shader); + iris_store_gs_state(screen, shader); break; case IRIS_CACHE_FS: - iris_store_fs_state(devinfo, shader); + iris_store_fs_state(screen, shader); break; case IRIS_CACHE_CS: - iris_store_cs_state(devinfo, shader); + iris_store_cs_state(screen, shader); break; case IRIS_CACHE_BLORP: break; @@ -9248,11 +9271,9 @@ iris_upload_compute_walker(struct iris_context *ice, dispatch.group_size, dispatch.simd_size); idd.SamplerStatePointer = shs->sampler_table.offset; - idd.SamplerCount = encode_sampler_count(shader), + idd.SamplerCount = encode_sampler_count(screen, shader), idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - idd.BindingTableEntryCount = devinfo->verx10 == 125 ? 
- 0 : MIN2(shader->bt.size_bytes / 4, 31); + idd.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31); idd.NumberOfBarriers = cs_data->uses_barrier; #if GFX_VER >= 30 idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used); diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c index 27b540ea4d9..2e1a3e76c38 100644 --- a/src/intel/vulkan/anv_instance.c +++ b/src/intel/vulkan/anv_instance.c @@ -36,6 +36,8 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false) DRI_CONF_ANV_DISABLE_LINK_TIME_OPTIMIZATION(false) DRI_CONF_SHADER_SPILLING_RATE(11) + DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false) + DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false) DRI_CONFIG_INTEL_TBIMR(true) DRI_CONFIG_INTEL_VF_DISTRIBUTION(true) DRI_CONFIG_INTEL_TE_DISTRIBUTION(true) @@ -200,6 +202,10 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptioni(&instance->dri_options, "force_vk_vendor"); instance->has_fake_sparse = driQueryOptionb(&instance->dri_options, "fake_sparse"); + instance->force_sampler_prefetch = + driQueryOptionb(&instance->dri_options, "intel_force_sampler_prefetch"); + instance->force_compute_surface_prefetch = + driQueryOptionb(&instance->dri_options, "intel_force_compute_surface_prefetch"); instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr"); instance->enable_vf_distribution = driQueryOptionb(&instance->dri_options, "intel_vf_distribution"); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5d1fccb6bcb..fb5a58f3888 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1782,6 +1782,8 @@ struct anv_instance { bool custom_border_colors_without_format; bool vf_component_packing; bool large_workgroup_non_coherent_image_workaround; + bool force_sampler_prefetch; + bool force_compute_surface_prefetch; /* HW workarounds */ bool no_16bit; diff --git 
a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c index 747427b8890..5feddc1f62b 100644 --- a/src/intel/vulkan/genX_shader.c +++ b/src/intel/vulkan/genX_shader.c @@ -27,15 +27,36 @@ })) static uint32_t -get_sampler_count(const struct anv_shader *shader) +get_surface_count(const struct anv_device *device, + const struct anv_shader *shader) { - uint32_t count_by_4 = DIV_ROUND_UP(shader->bind_map.sampler_count, 4); +#if GFX_VERx10 >= 125 + if (shader->vk.stage == MESA_SHADER_COMPUTE && + !device->physical->instance->force_compute_surface_prefetch) + return 0; +#endif + return shader->bind_map.surface_count; +} - /* We can potentially have way more than 32 samplers and that's ok. - * However, the 3DSTATE_XS packets only have 3 bits to specify how - * many to pre-fetch and all values above 4 are marked reserved. +static uint32_t +get_sampler_count(const struct anv_device *device, + const struct anv_shader *shader) +{ +#if GFX_VER == 11 + /* Wa_1606682166: + * + * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. Disable + * the Sampler state prefetch functionality in the SARB by programming + * 0xB000[30] to '1'. */ - return MIN2(count_by_4, 4); + return 0; +#else + if (!device->physical->instance->force_sampler_prefetch) + return 0; + + return DIV_ROUND_UP( + CLAMP(shader->bind_map.sampler_count, 0, 16), 4); +#endif } static UNUSED struct anv_address @@ -557,13 +578,8 @@ emit_vs_shader(struct anv_batch *batch, vs.SingleVertexDispatch = false; #endif vs.VectorMaskEnable = false; - /* Wa_1606682166: - * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. - * Disable the Sampler state prefetch functionality in the SARB by - * programming 0xB000[30] to '1'. - */ - vs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(shader); - vs.BindingTableEntryCount = shader->bind_map.surface_count; + vs.SamplerCount = get_sampler_count(device, shader); + vs.BindingTableEntryCount = get_surface_count(device, shader); vs.FloatingPointMode = IEEE754; vs.IllegalOpcodeExceptionEnable = false; vs.SoftwareExceptionEnable = false; @@ -619,9 +635,8 @@ emit_hs_shader(struct anv_batch *batch, hs.Enable = true; hs.StatisticsEnable = true; hs.KernelStartPointer = shader->kernel.offset; - /* Wa_1606682166 */ - hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - hs.BindingTableEntryCount = shader->bind_map.surface_count; + hs.SamplerCount = get_sampler_count(device, shader); + hs.BindingTableEntryCount = get_surface_count(device, shader); #if GFX_VER >= 12 /* Wa_1604578095: @@ -724,9 +739,8 @@ emit_ds_shader(struct anv_batch *batch, ds.Enable = true; ds.StatisticsEnable = true; ds.KernelStartPointer = shader->kernel.offset; - /* Wa_1606682166 */ - ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - ds.BindingTableEntryCount = shader->bind_map.surface_count; + ds.SamplerCount = get_sampler_count(device, shader); + ds.BindingTableEntryCount = get_surface_count(device, shader); ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length; @@ -799,9 +813,8 @@ emit_gs_shader(struct anv_batch *batch, gs.SingleProgramFlow = false; gs.VectorMaskEnable = false; - /* Wa_1606682166 */ - gs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(shader); - gs.BindingTableEntryCount = shader->bind_map.surface_count; + gs.SamplerCount = get_sampler_count(device, shader); + gs.BindingTableEntryCount = get_surface_count(device, shader); gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; @@ -1060,9 +1073,8 @@ emit_ps_shader(struct anv_batch *batch, ps.SingleProgramFlow = false; ps.VectorMaskEnable = wm_prog_data->uses_vmask; - /* Wa_1606682166 */ - ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - ps.BindingTableEntryCount = shader->bind_map.surface_count; + ps.SamplerCount = get_sampler_count(device, shader); + ps.BindingTableEntryCount = get_surface_count(device, shader); #if GFX_VER < 20 ps.PushConstantEnable = wm_prog_data->base.push_sizes[0] > 0; #endif @@ -1177,11 +1189,8 @@ emit_cs_shader(struct anv_batch *batch, }, .InterfaceDescriptor = { .KernelStartPointer = shader->kernel.offset, - .SamplerCount = DIV_ROUND_UP( - CLAMP(shader->bind_map.sampler_count, 0, 16), 4), - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? - 0 : 1 + MIN2(shader->bind_map.surface_count, 30), + .SamplerCount = get_sampler_count(device, shader), + .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31), .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .SharedLocalMemorySize = intel_compute_slm_encode_size( GFX_VER, cs_prog_data->base.total_shared), @@ -1231,16 +1240,8 @@ emit_cs_shader(struct anv_batch *batch, shader->kernel.offset + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), - /* Wa_1606682166 */ - .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader), - - /* We add 1 because the CS indirect parameters buffer isn't accounted - * for in bind_map.surface_count. - * - * Typically set to 0 to avoid prefetching on every thread dispatch. - */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? 
- 0 : MIN2(shader->bind_map.surface_count, 30), + .SamplerCount = get_sampler_count(device, shader), + .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31), .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared), diff --git a/src/util/driconf.h b/src/util/driconf.h index 4ed27b66443..055dbaa8223 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -353,6 +353,14 @@ #define DRI_CONFIG_INTEL_TBIMR(def) \ DRI_CONF_OPT_B(intel_tbimr, def, "Enable TBIMR tiled rendering") +#define DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(def) \ + DRI_CONF_OPT_B(intel_force_compute_surface_prefetch, def, \ + "Enable binding table surface prefetching for compute shaders") + +#define DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(def) \ + DRI_CONF_OPT_B(intel_force_sampler_prefetch, def, \ + "Enable binding table sampler prefetching") + #define DRI_CONFIG_INTEL_VF_DISTRIBUTION(def) \ DRI_CONF_OPT_B(intel_vf_distribution, def, "Enable geometry distribution")