ac, aco, radv: Clarify LDS size on GFX6, and NGG shaders.

This information was wrong in some places; let's fix it now.

GFX6:
Each CU has 64KB LDS, but a single workgroup can only use 32KB of it.

NGG:
There was some misinformation about NGG only being able to
address 32 KB of LDS. This turns out not to be true:
NGG can address the full 64 KB.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21935>
This commit is contained in:
Timur Kristóf 2023-03-15 11:59:41 -07:00 committed by Marge Bot
parent edf30b1c6d
commit 4ae46840cd
5 changed files with 26 additions and 9 deletions

View file

@ -1058,12 +1058,17 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info)
}
info->r600_has_virtual_memory = true;
/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
/* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
* 16KB makes some SIMDs unoccupied).
*
* LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
* GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
* GFX7+: Workgroups can use up to 64KB.
* GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
*/
info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024;
info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024
: info->gfx_level >= GFX7 ? 64 * 1024
: 32 * 1024;
/* lds_encode_granularity is the block size used for encoding registers.
* lds_alloc_granularity is what the hardware will align the LDS size to.
*/

View file

@ -113,6 +113,18 @@ Some instructions have a `_LEGACY` variant which implements "DX9 rules", in whic
the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA
mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA.
## LDS size and allocation granule
GFX7-8 ISA manuals are mistaken about the available LDS size.
* GFX7+ workgroups can use 64KB LDS.
There is 64KB LDS per CU.
* GFX6 workgroups can use 32KB LDS.
There is 64KB LDS per CU, but a single workgroup can only use half of it.
Regarding the LDS allocation granule, Mesa has the correct details and
the ISA manuals are mistaken.
## `m0` with LDS instructions on Vega and newer
The Vega ISA doc (both the old one and the "7nm" one) claims that LDS instructions

View file

@ -254,8 +254,6 @@ setup_vs_variables(isel_context* ctx, nir_shader* nir)
if (ctx->stage == vertex_ngg) {
ctx->program->config->lds_size =
DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
(32 * 1024));
}
}
@ -285,8 +283,6 @@ setup_tes_variables(isel_context* ctx, nir_shader* nir)
if (ctx->stage == tess_eval_ngg) {
ctx->program->config->lds_size =
DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
(32 * 1024));
}
}
@ -295,7 +291,6 @@ setup_ms_variables(isel_context* ctx, nir_shader* nir)
{
ctx->program->config->lds_size =
DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
}
void

View file

@ -101,7 +101,10 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 :
gfx_level >= GFX7 ? 512 : 256;
program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
/* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

View file

@ -137,7 +137,9 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *info)
else
info->num_physical_wave64_vgprs_per_simd = 256;
info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;
info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024;
info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024
: info->gfx_level >= GFX7 ? 64 * 1024
: 32 * 1024;
info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4;
info->lds_alloc_granularity =
info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;