ac, aco, radv: Clarify LDS size on GFX6, and NGG shaders.

This information was wrong in some places; fix it now. GFX6: The GPU has 64KB LDS per CU, but only 32KB is usable by a workgroup. NGG: There was some misinformation that NGG can only address 32KB of LDS; this is not true, and it can address the full 64KB. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21935>
2026-01-06 13:10:10 +01:00 · 2023-03-15 11:59:41 -07:00 · 2023-03-15 11:59:41 -07:00 · 4ae46840cd
commit 4ae46840cd
parent edf30b1c6d
5 changed files with 26 additions and 9 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1058,12 +1058,17 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info)
   }
   info->r600_has_virtual_memory = true;

-   /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
+   /* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
    * 16KB makes some SIMDs unoccupied).
    *
-    * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+    * GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+    * GFX7+: Workgroups can use up to 64KB.
+    * GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
    */
-   info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024;
+   info->lds_size_per_workgroup = info->gfx_level >= GFX10  ? 128 * 1024
+                                  : info->gfx_level >= GFX7 ? 64 * 1024
+                                                            : 32 * 1024;
+
   /* lds_encode_granularity is the block size used for encoding registers.
    * lds_alloc_granularity is what the hardware will align the LDS size to.
    */
--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@ -113,6 +113,18 @@ Some instructions have a `_LEGACY` variant which implements "DX9 rules", in whic
 the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA
 mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA.

+## LDS size and allocation granule
+
+GFX7-8 ISA manuals are mistaken about the available LDS size.
+
+* GFX7+ workgroups can use 64KB LDS.
+  There is 64KB LDS per CU.
+* GFX6 workgroups can use 32KB LDS.
+  There is 64KB LDS per CU, but a single workgroup can only use half of it.
+
+Regarding the LDS allocation granule, Mesa has the correct details and
+the ISA manuals are mistaken.
+
 ## `m0` with LDS instructions on Vega and newer

 The Vega ISA doc (both the old one and the "7nm" one) claims that LDS instructions
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@ -254,8 +254,6 @@ setup_vs_variables(isel_context* ctx, nir_shader* nir)
   if (ctx->stage == vertex_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
-      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
-             (32 * 1024));
   }
 }

@ -285,8 +283,6 @@ setup_tes_variables(isel_context* ctx, nir_shader* nir)
   if (ctx->stage == tess_eval_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
-      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
-             (32 * 1024));
   }
 }

@ -295,7 +291,6 @@ setup_ms_variables(isel_context* ctx, nir_shader* nir)
 {
   ctx->program->config->lds_size =
      DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
-   assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
 }

 void
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -101,7 +101,10 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 :
                                       gfx_level >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
+
+   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
+
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

--- a/src/amd/vulkan/winsys/null/radv_null_winsys.c
+++ b/src/amd/vulkan/winsys/null/radv_null_winsys.c
@ -137,7 +137,9 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *info)
   else
      info->num_physical_wave64_vgprs_per_simd = 256;
   info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;
-   info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024 : 64 * 1024;
+   info->lds_size_per_workgroup = info->gfx_level >= GFX10  ? 128 * 1024
+                                  : info->gfx_level >= GFX7 ? 64 * 1024
+                                                            : 32 * 1024;
   info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4;
   info->lds_alloc_granularity =
      info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;