diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 586c523cb70..043121b4835 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -8863,7 +8863,10 @@ iris_upload_compute_walker(struct iris_context *ice, idd.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, shader->total_shared); idd.PreferredSLMAllocationSize = - intel_compute_preferred_slm_calc_encode_size(devinfo, shader->total_shared); + intel_compute_preferred_slm_calc_encode_size(devinfo, + shader->total_shared, + dispatch.group_size, + dispatch.simd_size); idd.SamplerStatePointer = shs->sampler_table.offset; idd.SamplerCount = encode_sampler_count(shader), idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h index 677fb782849..259a7235373 100644 --- a/src/intel/blorp/blorp_genX_exec_brw.h +++ b/src/intel/blorp/blorp_genX_exec_brw.h @@ -1738,7 +1738,10 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared), .PreferredSLMAllocationSize = - intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->total_shared), + intel_compute_preferred_slm_calc_encode_size(devinfo, + prog_data->total_shared, + dispatch.group_size, + dispatch.simd_size), .NumberOfBarriers = cs_prog_data->uses_barrier, }; } diff --git a/src/intel/common/intel_compute_slm.c b/src/intel/common/intel_compute_slm.c index d8413bed16c..3437c72cc4f 100644 --- a/src/intel/common/intel_compute_slm.c +++ b/src/intel/common/intel_compute_slm.c @@ -154,22 +154,31 @@ intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes) return slm_encode_lookup(table, table_len, bytes)->encode; } +/** + * Compute a shared local memory size to be allocated for each sub-slice. 
+ * It estimates how many workgroups will run concurrently per sub-slice and + * multiplies that by each workgroup's SLM size. + */ uint32_t -intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size) +intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, + const uint32_t slm_size_per_workgroup, + const uint32_t invocations_per_workgroup, + const uint8_t cs_simd) { - /* Older platforms than Xe2 has a encode = 0 that sets preferred SLM - * allocation to maximum supported, so keeping it until we come up - * with a formula to calculate the optimal preferred slm allocation. - */ - if (devinfo->ver < 20) - return 0; + const uint32_t max_preferred_slm_size = intel_device_info_get_max_preferred_slm_size(devinfo); + const uint32_t invocations_per_ss = intel_device_info_get_eu_count_first_subslice(devinfo) * + devinfo->num_thread_per_eu * cs_simd; + uint32_t preferred_slm_size; - /* Xe2 has 2 requirements for preferred SLM size: - * - this value needs to be >= then SLM size - * - this value must be less than shared SLM/L1$ RAM in the sub-slice of platform - * - * For now it is not calculating the optimal preferred SLM allocation, - * it is just setting the minimum value that comply with first restriction. 
- */ - return intel_compute_preferred_slm_encode_size(devinfo->ver, slm_size); + if (slm_size_per_workgroup) { + uint32_t workgroups_per_ss = invocations_per_ss / invocations_per_workgroup; + + preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup; + preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size); + } else { + preferred_slm_size = 0; + } + + assert(preferred_slm_size >= slm_size_per_workgroup); + return intel_compute_preferred_slm_encode_size(devinfo->ver, preferred_slm_size); } diff --git a/src/intel/common/intel_compute_slm.h b/src/intel/common/intel_compute_slm.h index c911da75106..c3bd7a81d80 100644 --- a/src/intel/common/intel_compute_slm.h +++ b/src/intel/common/intel_compute_slm.h @@ -11,4 +11,7 @@ uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes); uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes); -uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size); +uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, + const uint32_t slm_size_per_workgroup, + const uint32_t invocations_per_workgroup, + const uint8_t cs_simd); diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 1f547b1f389..2eaef454be9 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -286,7 +286,10 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer, .NumberofThreadsinGPGPUThreadGroup = dispatch->threads, .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared), .PreferredSLMAllocationSize = - intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->base.total_shared), + intel_compute_preferred_slm_calc_encode_size(devinfo, + prog_data->base.total_shared, + dispatch->group_size, + dispatch->simd_size), .NumberOfBarriers = prog_data->uses_barrier, }; } diff --git 
a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index f5782899a82..0e0a189fcf6 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -1795,7 +1795,10 @@ emit_task_state(struct anv_graphics_pipeline *pipeline) task.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared); task.PreferredSLMAllocationSize = - intel_compute_preferred_slm_calc_encode_size(devinfo, task_prog_data->base.base.total_shared); + intel_compute_preferred_slm_calc_encode_size(devinfo, + task_prog_data->base.base.total_shared, + task_dispatch.group_size, + task_dispatch.simd_size); /* * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address @@ -1876,7 +1879,10 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline) mesh.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared); mesh.PreferredSLMAllocationSize = - intel_compute_preferred_slm_calc_encode_size(devinfo, mesh_prog_data->base.base.total_shared); + intel_compute_preferred_slm_calc_encode_size(devinfo, + mesh_prog_data->base.base.total_shared, + mesh_dispatch.group_size, + mesh_dispatch.simd_size); /* * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address