diff --git a/src/gallium/drivers/iris/driinfo_iris.h b/src/gallium/drivers/iris/driinfo_iris.h index d0fa3b6c1d9..9486da874fb 100644 --- a/src/gallium/drivers/iris/driinfo_iris.h +++ b/src/gallium/drivers/iris/driinfo_iris.h @@ -14,6 +14,8 @@ DRI_CONF_SECTION_END DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false) + DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false) DRI_CONFIG_INTEL_TBIMR(true) DRI_CONFIG_INTEL_VF_DISTRIBUTION(true) DRI_CONFIG_INTEL_TE_DISTRIBUTION(true) diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index 4e7cae5bb9a..dede919098f 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -163,8 +163,6 @@ iris_upload_shader(struct iris_screen *screen, const void *key, const void *assembly) { - const struct intel_device_info *devinfo = screen->devinfo; - u_upload_alloc_ref(uploader, 0, shader->program_size, 64, &shader->assembly.offset, &shader->assembly.res, &shader->map); @@ -200,7 +198,7 @@ iris_upload_shader(struct iris_screen *screen, } /* Store the 3DSTATE shader packets and other derived state. 
*/ - screen->vtbl.store_derived_program_state(devinfo, cache_id, shader); + screen->vtbl.store_derived_program_state(screen, cache_id, shader); util_queue_fence_signal(&shader->ready); diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c index bb2e3bbf936..54374f019e8 100644 --- a/src/gallium/drivers/iris/iris_screen.c +++ b/src/gallium/drivers/iris/iris_screen.c @@ -745,6 +745,10 @@ iris_screen_create(int fd, const struct pipe_screen_config *config) driQueryOptionb(config->options, "intel_enable_wa_14018912822"); screen->driconf.intel_enable_wa_14024015672_msaa = driQueryOptionb(config->options, "intel_enable_wa_14024015672_msaa"); + screen->driconf.force_sampler_prefetch = + driQueryOptionb(config->options, "intel_force_sampler_prefetch"); + screen->driconf.force_compute_surface_prefetch = + driQueryOptionb(config->options, "intel_force_compute_surface_prefetch"); screen->driconf.enable_tbimr = driQueryOptionb(config->options, "intel_tbimr"); screen->driconf.enable_vf_distribution = diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h index 0e6a7f540b5..902c0c95cd1 100644 --- a/src/gallium/drivers/iris/iris_screen.h +++ b/src/gallium/drivers/iris/iris_screen.h @@ -132,7 +132,7 @@ struct iris_vtable { uint32_t offset); unsigned (*derived_program_state_size)(enum iris_program_cache_id id); - void (*store_derived_program_state)(const struct intel_device_info *devinfo, + void (*store_derived_program_state)(const struct iris_screen *screen, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader); uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol, @@ -209,6 +209,8 @@ struct iris_screen { bool enable_te_distribution; unsigned generated_indirect_threshold; bool disable_threaded_context; + bool force_sampler_prefetch; + bool force_compute_surface_prefetch; } driconf; /** Does the kernel support various features (KERNEL_HAS_* bitfield)? 
*/ diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 0de2cd5d6f3..bfccf68850d 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -5111,20 +5111,40 @@ iris_populate_cs_key(const struct iris_context *ice, } static inline uint32_t -encode_sampler_count(const struct iris_compiled_shader *shader) +encode_sampler_count(const struct iris_screen *screen, + const struct iris_compiled_shader *shader) { +#if GFX_VER == 11 + /* Wa_1606682166 */ + return 0; +#else + if (!screen->driconf.force_sampler_prefetch) + return 0; /* We can potentially have way more than 32 samplers and that's ok. * However, the 3DSTATE_XS packets only have 3 bits to specify how * many to pre-fetch and all values above 4 are marked reserved. */ uint32_t count = util_last_bit64(shader->bt.samplers_used_mask); return DIV_ROUND_UP(CLAMP(count, 0, 16), 4); +#endif +} + +static inline uint32_t +encode_surface_count(const struct iris_screen *screen, + const struct iris_compiled_shader *shader) +{ +#if GFX_VERx10 >= 125 + if (shader->stage == MESA_SHADER_COMPUTE && + !screen->driconf.force_compute_surface_prefetch) + return 0; +#endif + return shader->bt.size_bytes / 4; } #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ pkt.KernelStartPointer = KSP(shader); \ - pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ - pkt.SamplerCount = encode_sampler_count(shader); \ + pkt.BindingTableEntryCount = encode_surface_count(screen, shader); \ + pkt.SamplerCount = encode_sampler_count(screen, shader); \ pkt.FloatingPointMode = shader->use_alt_mode; \ \ pkt.DispatchGRFStartRegisterForURBData = \ @@ -5180,9 +5200,10 @@ encode_sampler_count(const struct iris_compiled_shader *shader) * Encode most of 3DSTATE_VS based on the compiled shader. 
*/ static void -iris_store_vs_state(const struct intel_device_info *devinfo, +iris_store_vs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_vue_data *vue_data = iris_vue_data(shader); iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) { @@ -5203,9 +5224,10 @@ iris_store_vs_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_HS based on the compiled shader. */ static void -iris_store_tcs_state(const struct intel_device_info *devinfo, +iris_store_tcs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_tcs_data *tcs_data = iris_tcs_data(shader); struct iris_vue_data *vue_data = &tcs_data->base; @@ -5252,9 +5274,10 @@ iris_store_tcs_state(const struct intel_device_info *devinfo, * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader. */ static void -iris_store_tes_state(const struct intel_device_info *devinfo, +iris_store_tes_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_tes_data *tes_data = iris_tes_data(shader); struct iris_vue_data *vue_data = &tes_data->base; @@ -5319,9 +5342,10 @@ iris_store_tes_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_GS based on the compiled shader. */ static void -iris_store_gs_state(const struct intel_device_info *devinfo, +iris_store_gs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_gs_data *gs_data = iris_gs_data(shader); struct iris_vue_data *vue_data = &gs_data->base; @@ -5367,9 +5391,10 @@ iris_store_gs_state(const struct intel_device_info *devinfo, * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader. 
*/ static void -iris_store_fs_state(const struct intel_device_info *devinfo, +iris_store_fs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { + const struct intel_device_info *devinfo = screen->devinfo; struct iris_fs_data *fs_data = iris_fs_data(shader); uint32_t *ps_state = (void *) shader->derived_data; @@ -5377,8 +5402,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo, iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) { ps.VectorMaskEnable = fs_data->uses_vmask; - ps.BindingTableEntryCount = shader->bt.size_bytes / 4; - ps.SamplerCount = encode_sampler_count(shader); + ps.BindingTableEntryCount = encode_surface_count(screen, shader); + ps.SamplerCount = encode_sampler_count(screen, shader); ps.FloatingPointMode = shader->use_alt_mode; ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1); @@ -5453,7 +5478,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo, * This must match the data written by the iris_store_xs_state() functions. */ static void -iris_store_cs_state(const struct intel_device_info *devinfo, +iris_store_cs_state(const struct iris_screen *screen, struct iris_compiled_shader *shader) { struct iris_cs_data *cs_data = iris_cs_data(shader); @@ -5471,10 +5496,8 @@ iris_store_cs_state(const struct intel_device_info *devinfo, #if GFX_VERx10 <= 125 desc.BarrierEnable = cs_data->uses_barrier; #endif - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - desc.BindingTableEntryCount = devinfo->verx10 == 125 ? - 0 : MIN2(shader->bt.size_bytes / 4, 31); - desc.SamplerCount = encode_sampler_count(shader); + desc.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31); + desc.SamplerCount = encode_sampler_count(screen, shader); /* TODO: Check if we are missing workarounds and enable mid-thread * preemption. 
* @@ -5522,28 +5545,28 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id) * get most of the state packet without having to reconstruct it. */ static void -iris_store_derived_program_state(const struct intel_device_info *devinfo, +iris_store_derived_program_state(const struct iris_screen *screen, enum iris_program_cache_id cache_id, struct iris_compiled_shader *shader) { switch (cache_id) { case IRIS_CACHE_VS: - iris_store_vs_state(devinfo, shader); + iris_store_vs_state(screen, shader); break; case IRIS_CACHE_TCS: - iris_store_tcs_state(devinfo, shader); + iris_store_tcs_state(screen, shader); break; case IRIS_CACHE_TES: - iris_store_tes_state(devinfo, shader); + iris_store_tes_state(screen, shader); break; case IRIS_CACHE_GS: - iris_store_gs_state(devinfo, shader); + iris_store_gs_state(screen, shader); break; case IRIS_CACHE_FS: - iris_store_fs_state(devinfo, shader); + iris_store_fs_state(screen, shader); break; case IRIS_CACHE_CS: - iris_store_cs_state(devinfo, shader); + iris_store_cs_state(screen, shader); break; case IRIS_CACHE_BLORP: break; @@ -9248,11 +9271,9 @@ iris_upload_compute_walker(struct iris_context *ice, dispatch.group_size, dispatch.simd_size); idd.SamplerStatePointer = shs->sampler_table.offset; - idd.SamplerCount = encode_sampler_count(shader), + idd.SamplerCount = encode_sampler_count(screen, shader), idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - idd.BindingTableEntryCount = devinfo->verx10 == 125 ? 
- 0 : MIN2(shader->bt.size_bytes / 4, 31); + idd.BindingTableEntryCount = MIN2(encode_surface_count(screen, shader), 31); idd.NumberOfBarriers = cs_data->uses_barrier; #if GFX_VER >= 30 idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used); diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c index 27b540ea4d9..2e1a3e76c38 100644 --- a/src/intel/vulkan/anv_instance.c +++ b/src/intel/vulkan/anv_instance.c @@ -36,6 +36,8 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false) DRI_CONF_ANV_DISABLE_LINK_TIME_OPTIMIZATION(false) DRI_CONF_SHADER_SPILLING_RATE(11) + DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(false) + DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(false) DRI_CONFIG_INTEL_TBIMR(true) DRI_CONFIG_INTEL_VF_DISTRIBUTION(true) DRI_CONFIG_INTEL_TE_DISTRIBUTION(true) @@ -200,6 +202,10 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptioni(&instance->dri_options, "force_vk_vendor"); instance->has_fake_sparse = driQueryOptionb(&instance->dri_options, "fake_sparse"); + instance->force_sampler_prefetch = + driQueryOptionb(&instance->dri_options, "intel_force_sampler_prefetch"); + instance->force_compute_surface_prefetch = + driQueryOptionb(&instance->dri_options, "intel_force_compute_surface_prefetch"); instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr"); instance->enable_vf_distribution = driQueryOptionb(&instance->dri_options, "intel_vf_distribution"); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5d1fccb6bcb..fb5a58f3888 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1782,6 +1782,8 @@ struct anv_instance { bool custom_border_colors_without_format; bool vf_component_packing; bool large_workgroup_non_coherent_image_workaround; + bool force_sampler_prefetch; + bool force_compute_surface_prefetch; /* HW workarounds */ bool no_16bit; diff --git 
a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c index 747427b8890..5feddc1f62b 100644 --- a/src/intel/vulkan/genX_shader.c +++ b/src/intel/vulkan/genX_shader.c @@ -27,15 +27,36 @@ })) static uint32_t -get_sampler_count(const struct anv_shader *shader) +get_surface_count(const struct anv_device *device, + const struct anv_shader *shader) { - uint32_t count_by_4 = DIV_ROUND_UP(shader->bind_map.sampler_count, 4); +#if GFX_VERx10 >= 125 + if (shader->vk.stage == MESA_SHADER_COMPUTE && + !device->physical->instance->force_compute_surface_prefetch) + return 0; +#endif + return shader->bind_map.surface_count; +} - /* We can potentially have way more than 32 samplers and that's ok. - * However, the 3DSTATE_XS packets only have 3 bits to specify how - * many to pre-fetch and all values above 4 are marked reserved. +static uint32_t +get_sampler_count(const struct anv_device *device, + const struct anv_shader *shader) +{ +#if GFX_VER == 11 + /* Wa_1606682166: + * + * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. Disable + * the Sampler state prefetch functionality in the SARB by programming + * 0xB000[30] to '1'. */ - return MIN2(count_by_4, 4); + return 0; +#else + if (!device->physical->instance->force_sampler_prefetch) + return 0; + + return DIV_ROUND_UP( + CLAMP(shader->bind_map.sampler_count, 0, 16), 4); +#endif } static UNUSED struct anv_address @@ -557,13 +578,8 @@ emit_vs_shader(struct anv_batch *batch, vs.SingleVertexDispatch = false; #endif vs.VectorMaskEnable = false; - /* Wa_1606682166: - * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. - * Disable the Sampler state prefetch functionality in the SARB by - * programming 0xB000[30] to '1'. - */ - vs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(shader); - vs.BindingTableEntryCount = shader->bind_map.surface_count; + vs.SamplerCount = get_sampler_count(device, shader); + vs.BindingTableEntryCount = get_surface_count(device, shader); vs.FloatingPointMode = IEEE754; vs.IllegalOpcodeExceptionEnable = false; vs.SoftwareExceptionEnable = false; @@ -619,9 +635,8 @@ emit_hs_shader(struct anv_batch *batch, hs.Enable = true; hs.StatisticsEnable = true; hs.KernelStartPointer = shader->kernel.offset; - /* Wa_1606682166 */ - hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - hs.BindingTableEntryCount = shader->bind_map.surface_count; + hs.SamplerCount = get_sampler_count(device, shader); + hs.BindingTableEntryCount = get_surface_count(device, shader); #if GFX_VER >= 12 /* Wa_1604578095: @@ -724,9 +739,8 @@ emit_ds_shader(struct anv_batch *batch, ds.Enable = true; ds.StatisticsEnable = true; ds.KernelStartPointer = shader->kernel.offset; - /* Wa_1606682166 */ - ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - ds.BindingTableEntryCount = shader->bind_map.surface_count; + ds.SamplerCount = get_sampler_count(device, shader); + ds.BindingTableEntryCount = get_surface_count(device, shader); ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length; @@ -799,9 +813,8 @@ emit_gs_shader(struct anv_batch *batch, gs.SingleProgramFlow = false; gs.VectorMaskEnable = false; - /* Wa_1606682166 */ - gs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(shader); - gs.BindingTableEntryCount = shader->bind_map.surface_count; + gs.SamplerCount = get_sampler_count(device, shader); + gs.BindingTableEntryCount = get_surface_count(device, shader); gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; @@ -1060,9 +1073,8 @@ emit_ps_shader(struct anv_batch *batch, ps.SingleProgramFlow = false; ps.VectorMaskEnable = wm_prog_data->uses_vmask; - /* Wa_1606682166 */ - ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader); - ps.BindingTableEntryCount = shader->bind_map.surface_count; + ps.SamplerCount = get_sampler_count(device, shader); + ps.BindingTableEntryCount = get_surface_count(device, shader); #if GFX_VER < 20 ps.PushConstantEnable = wm_prog_data->base.push_sizes[0] > 0; #endif @@ -1177,11 +1189,8 @@ emit_cs_shader(struct anv_batch *batch, }, .InterfaceDescriptor = { .KernelStartPointer = shader->kernel.offset, - .SamplerCount = DIV_ROUND_UP( - CLAMP(shader->bind_map.sampler_count, 0, 16), 4), - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? - 0 : 1 + MIN2(shader->bind_map.surface_count, 30), + .SamplerCount = get_sampler_count(device, shader), + .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31), .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .SharedLocalMemorySize = intel_compute_slm_encode_size( GFX_VER, cs_prog_data->base.total_shared), @@ -1231,16 +1240,8 @@ emit_cs_shader(struct anv_batch *batch, shader->kernel.offset + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), - /* Wa_1606682166 */ - .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader), - - /* We add 1 because the CS indirect parameters buffer isn't accounted - * for in bind_map.surface_count. - * - * Typically set to 0 to avoid prefetching on every thread dispatch. - */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? 
- 0 : MIN2(shader->bind_map.surface_count, 30), + .SamplerCount = get_sampler_count(device, shader), + .BindingTableEntryCount = MIN2(get_surface_count(device, shader), 31), .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared), diff --git a/src/util/driconf.h b/src/util/driconf.h index 4ed27b66443..055dbaa8223 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -353,6 +353,14 @@ #define DRI_CONFIG_INTEL_TBIMR(def) \ DRI_CONF_OPT_B(intel_tbimr, def, "Enable TBIMR tiled rendering") +#define DRI_CONFIG_INTEL_FORCE_COMPUTE_SURFACE_PREFETCH(def) \ + DRI_CONF_OPT_B(intel_force_compute_surface_prefetch, def, \ + "Enable binding table surface prefetching for compute shaders") + +#define DRI_CONFIG_INTEL_FORCE_SAMPLER_PREFETCH(def) \ + DRI_CONF_OPT_B(intel_force_sampler_prefetch, def, \ + "Enable binding table sampler prefetching") + #define DRI_CONFIG_INTEL_VF_DISTRIBUTION(def) \ DRI_CONF_OPT_B(intel_vf_distribution, def, "Enable geometry distribution")