diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 4d8014821f9..4721ecc7a89 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1058,12 +1058,17 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info) } info->r600_has_virtual_memory = true; - /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above + /* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above * 16KB makes some SIMDs unoccupied). * - * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. + * GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. + * GFX7+: Workgroups can use up to 64KB. + * GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB. */ - info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024; + info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 + : info->gfx_level >= GFX7 ? 64 * 1024 + : 32 * 1024; + /* lds_encode_granularity is the block size used for encoding registers. * lds_alloc_granularity is what the hardware will align the LDS size to. */ diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index f0cdbf8612b..3b8590cbe9b 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -113,6 +113,18 @@ Some instructions have a `_LEGACY` variant which implements "DX9 rules", in whic the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA. +## LDS size and allocation granule + +GFX7-8 ISA manuals are mistaken about the available LDS size. + +* GFX7+ workgroups can use 64KB LDS. + There is 64KB LDS per CU. +* GFX6 workgroups can use 32KB LDS. + There is 64KB LDS per CU, but a single workgroup can only use half of it. + + Regarding the LDS allocation granule, Mesa has the correct details and + the ISA manuals are mistaken. + ## `m0` with LDS instructions on Vega and newer The Vega ISA doc (both the old one and the "7nm" one) claims that LDS instructions diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 66bc77f70c0..d5d2392cbb6 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -254,8 +254,6 @@ setup_vs_variables(isel_context* ctx, nir_shader* nir) if (ctx->stage == vertex_ngg) { ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < - (32 * 1024)); } } @@ -285,8 +283,6 @@ setup_tes_variables(isel_context* ctx, nir_shader* nir) if (ctx->stage == tess_eval_ngg) { ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < - (32 * 1024)); } } @@ -295,7 +291,6 @@ setup_ms_variables(isel_context* ctx, nir_shader* nir) { ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024)); } void diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index f592cb6e65d..ef04a89c780 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -101,7 +101,10 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info, program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 : gfx_level >= GFX7 ? 512 : 256; program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; + + /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */ program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768; + /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; diff --git a/src/amd/vulkan/winsys/null/radv_null_winsys.c b/src/amd/vulkan/winsys/null/radv_null_winsys.c index 28864a4570c..daeddc900a7 100644 --- a/src/amd/vulkan/winsys/null/radv_null_winsys.c +++ b/src/amd/vulkan/winsys/null/radv_null_winsys.c @@ -137,7 +137,9 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *info) else info->num_physical_wave64_vgprs_per_simd = 256; info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4; - info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024; + info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 + : info->gfx_level >= GFX7 ? 64 * 1024 + : 32 * 1024; info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4; info->lds_alloc_granularity = info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;