ac: move lds_size_per_workgroup to ac_compiler_info

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41022>
This commit is contained in:
Rhys Perry 2026-04-24 13:27:43 +01:00 committed by Marge Bot
parent 9b06b0f219
commit e40457b136
10 changed files with 20 additions and 21 deletions

View file

@ -236,7 +236,7 @@ static bool handle_env_var_force_family(struct radeon_info *info)
void
ac_fill_compiler_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info)
{
STATIC_ASSERT(sizeof(struct ac_compiler_info) == 52);
STATIC_ASSERT(sizeof(struct ac_compiler_info) == 56);
struct ac_compiler_info *out = &info->compiler_info;
@ -303,6 +303,14 @@ ac_fill_compiler_info(struct radeon_info *info, const struct drm_amdgpu_info_dev
out->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;
/* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
*
* GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
* GFX7+: Workgroups can use up to 64KB.
* GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
*/
out->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
out->hs_offchip_workgroup_dw_size = info->hs_offchip_workgroup_dw_size;
/* Flags */
@ -1146,14 +1154,6 @@ void ac_fill_hw_info(struct radeon_info *info, const struct drm_amdgpu_info_devi
info->sqc_scalar_cache_size = device_info->sqc_data_cache_size * 1024;
info->num_sqc_per_wgp = device_info->num_sqc_per_wgp;
/* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
*
* GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
* GFX7+: Workgroups can use up to 64KB.
* GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
*/
info->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
/* Get the number of good compute units. */
info->num_cu = 0;
for (int i = 0; i < info->max_se; i++) {
@ -1913,7 +1913,6 @@ void ac_print_gpu_info(FILE *f, const struct radeon_info *info, int fd)
fprintf(f, " tcc_rb_non_coherent = %u\n", info->tcc_rb_non_coherent);
fprintf(f, " cp_sdma_ge_use_system_memory_scope = %u\n", info->cp_sdma_ge_use_system_memory_scope);
fprintf(f, " pc_lines = %u\n", info->pc_lines);
fprintf(f, " lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
fprintf(f, " lds_alloc_granularity = %i\n", ac_shader_get_lds_alloc_granularity(info->gfx_level));
fprintf(f, " max_memory_clock = %i MHz\n", info->memory_freq_mhz);
@ -2053,6 +2052,7 @@ void ac_print_gpu_info(FILE *f, const struct radeon_info *info, int fd)
fprintf(f, " max_vgpr_alloc = %i\n", info->compiler_info.max_vgpr_alloc);
fprintf(f, " wave64_vgpr_alloc_granularity = %i\n",
info->compiler_info.wave64_vgpr_alloc_granularity);
fprintf(f, " lds_size_per_workgroup = %u\n", info->compiler_info.lds_size_per_workgroup);
fprintf(f, " has_lds_bank_count_16 = %i\n", info->compiler_info.has_lds_bank_count_16);
fprintf(f, " has_sram_ecc_enabled = %i\n", info->compiler_info.has_sram_ecc_enabled);
fprintf(f, " has_point_sample_accel = %i\n", info->compiler_info.has_point_sample_accel);

View file

@ -116,6 +116,8 @@ struct ac_compiler_info {
uint32_t max_vgpr_alloc;
uint32_t wave64_vgpr_alloc_granularity;
uint32_t lds_size_per_workgroup;
uint32_t hs_offchip_workgroup_dw_size;
/* Flags */
@ -323,7 +325,6 @@ struct radeon_info {
bool cp_sdma_ge_use_system_memory_scope;
bool cp_dma_use_L2;
unsigned pc_lines;
uint32_t lds_size_per_workgroup;
/* CP info. */
bool gfx_ib_pad_with_type2;

View file

@ -472,7 +472,7 @@ static void ac_sqtt_fill_asic_info(const struct radeon_info *rad_info,
chunk->vram_size = (uint64_t)rad_info->vram_size_kb * 1024;
chunk->l2_cache_size = rad_info->l2_cache_size;
chunk->l1_cache_size = rad_info->tcp_cache_size;
chunk->lds_size = rad_info->lds_size_per_workgroup;
chunk->lds_size = rad_info->compiler_info.lds_size_per_workgroup;
strncpy(chunk->gpu_name, ac_get_family_name(rad_info->family), SQTT_GPU_NAME_MAX_SIZE - 1);

View file

@ -169,7 +169,7 @@ init_ray_query_vars(nir_shader *shader, const glsl_type *opaque_type, struct ray
uint32_t shared_offset = align(shader->info.shared_size, 4);
if (shader->info.stage != MESA_SHADER_COMPUTE || glsl_type_is_array(opaque_type) ||
shared_offset + shared_stack_size > compiler_info->hw.lds_size_per_workgroup) {
shared_offset + shared_stack_size > compiler_info->ac->lds_size_per_workgroup) {
dst->stack_entries = MAX_SCRATCH_STACK_ENTRY_COUNT;
} else {
if (radv_use_bvh_stack_rtn(compiler_info)) {

View file

@ -1147,7 +1147,6 @@ radv_device_init_compiler_info(struct radv_device *device)
.address32_hi = pdev->info.address32_hi,
.rbplus_allowed = pdev->info.rbplus_allowed,
.has_cs_regalloc_hang_bug = pdev->info.has_cs_regalloc_hang_bug,
.lds_size_per_workgroup = pdev->info.lds_size_per_workgroup,
},
/* Debug/tracing */
.debug =

View file

@ -1803,7 +1803,7 @@ radv_get_physical_device_properties(struct radv_physical_device *pdev)
.maxFragmentOutputAttachments = 8,
.maxFragmentDualSrcAttachments = 1,
.maxFragmentCombinedOutputResources = max_descriptor_set_size,
.maxComputeSharedMemorySize = pdev->info.lds_size_per_workgroup,
.maxComputeSharedMemorySize = pdev->info.compiler_info.lds_size_per_workgroup,
.maxComputeWorkGroupCount = {4294967295, 65535, 65535},
.maxComputeWorkGroupInvocations = 1024,
.maxComputeWorkGroupSize = {1024, 1024, 1024},

View file

@ -2245,7 +2245,7 @@ radv_postprocess_binary_config(const struct radv_compiler_info *compiler_info, s
}
}
assert(config->lds_size <= compiler_info->hw.lds_size_per_workgroup);
assert(config->lds_size <= compiler_info->ac->lds_size_per_workgroup);
unsigned lds_alloc = ac_shader_encode_lds_size(config->lds_size, gfx_level, stage);
switch (stage) {
@ -2857,7 +2857,8 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
simd_per_cu_wgp *= 2;
if (lds_per_workgroup) {
unsigned lds_per_cu_wgp = gpu_info->lds_size_per_workgroup * (gfx_level >= GFX10 && conf->wgp_mode ? 2 : 1);
unsigned lds_per_cu_wgp =
gpu_info->compiler_info.lds_size_per_workgroup * (gfx_level >= GFX10 && conf->wgp_mode ? 2 : 1);
unsigned max_cu_wgp_waves = lds_per_cu_wgp / lds_per_workgroup * waves_per_workgroup;
max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_cu_wgp_waves, simd_per_cu_wgp));
}

View file

@ -519,7 +519,6 @@ struct radv_compiler_info {
uint32_t address32_hi;
bool rbplus_allowed;
bool has_cs_regalloc_hang_bug;
uint32_t lds_size_per_workgroup;
} hw;
/* Debug/tracing */

View file

@ -218,7 +218,7 @@ unsigned si_calculate_needed_lds_size(enum amd_gfx_level gfx_level, struct si_sh
}
/* Check that the LDS size is within hw limits. */
assert(lds_size <= shader->selector->screen->info.lds_size_per_workgroup);
assert(lds_size <= shader->selector->screen->info.compiler_info.lds_size_per_workgroup);
return lds_size;
}
@ -305,7 +305,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
}
unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / sscreen->info.compiler_info.num_simd_per_compute_unit;
unsigned max_lds_per_simd = sscreen->info.compiler_info.lds_size_per_workgroup / sscreen->info.compiler_info.num_simd_per_compute_unit;
if (lds_per_wave)
max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

View file

@ -639,7 +639,6 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
ws->info.max_gflops = 128 * ws->info.num_cu * ws->info.max_gpu_freq_mhz / 1000;
ws->info.num_tcc_blocks = ws->info.max_tcc_blocks;
ws->info.tcp_cache_size = 16 * 1024;
ws->info.lds_size_per_workgroup = ws->info.gfx_level == GFX7 ? 64 * 1024 : 32 * 1024;
#ifdef HAVE_GALLIUM_RADEONSI
ac_fill_compiler_info(&ws->info, NULL);