radeonsi: pack GS_STATE_ESGS_VERTEX_STRIDE better to save 2 bits

Change it to the number of ES outputs, then compute the stride from that.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26917>
2026-01-03 18:00:10 +01:00 · 2023-12-31 12:27:55 -05:00 · 2023-12-31 12:27:55 -05:00 · bad2530a40
commit bad2530a40
parent 8eed352e05
3 changed files with 31 additions and 13 deletions
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@ -360,9 +360,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
-      replacement = shader->is_monolithic ?
-         nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4) :
-         GET_FIELD_NIR(GS_STATE_ESGS_VERTEX_STRIDE);
+      if (shader->is_monolithic) {
+         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
+      } else {
+         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
+         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
+      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@ -253,8 +253,12 @@ enum
 * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
 /* bit gap */
-#define GS_STATE_ESGS_VERTEX_STRIDE__SHIFT      11
-#define GS_STATE_ESGS_VERTEX_STRIDE__MASK       0xff /* max 32 * 4 + 1 */
+/* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
+ * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
+ * Only used by GFX9+ to compute LDS addresses of GS inputs.
+ */
+#define GS_STATE_NUM_ES_OUTPUTS__SHIFT          13
+#define GS_STATE_NUM_ES_OUTPUTS__MASK           0x3f
 /* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
 * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
 * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@ -906,14 +906,25 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
   assert(out->max_prims_per_subgroup <= max_out_prims);
 }

+/* Store the number of ES outputs in the GS_STATE_NUM_ES_OUTPUTS field of
+ * sctx->current_gs_state.
+ *
+ * esgs_vertex_stride is expected to be the ESGS vertex stride in dwords, which has the form
+ * NUM_ES_OUTPUTS * 4 + 1 (asserted below). The field is left untouched when the derived
+ * number of ES outputs is zero.
+ */
+static void gfx9_set_gs_sgpr_num_es_outputs(struct si_context *sctx, unsigned esgs_vertex_stride)
+{
+   /* The stride must always be odd (i.e. a multiple of 4, plus 1) to reduce LDS bank conflicts. */
+   assert(esgs_vertex_stride % 4 == 1);
+   /* Invert "stride = num_es_outputs * 4 + 1" to recover the output count. */
+   unsigned num_es_outputs = (esgs_vertex_stride - 1) / 4;
+
+   /* If there are no ES outputs, GS doesn't use this SGPR field, so only set it if the number
+    * is non-zero.
+    */
+   if (num_es_outputs)
+      SET_FIELD(sctx->current_gs_state, GS_STATE_NUM_ES_OUTPUTS, num_es_outputs);
+}
+
 static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
 {
   struct si_shader *shader = sctx->queued.named.gs;

-   if (sctx->gfx_level >= GFX9) {
-      SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-                shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
-   }
+   if (sctx->gfx_level >= GFX9)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);

   radeon_begin(&sctx->gfx_cs);

@ -1167,8 +1178,8 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index)
 {
   struct si_shader *shader = sctx->queued.named.gs;

-   SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-             shader->ngg.esgs_vertex_stride);
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);

   radeon_begin(&sctx->gfx_cs);
   if (HAS_TESS) {
@ -1226,8 +1237,8 @@ static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
 {
   struct si_shader *shader = sctx->queued.named.gs;

-   SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-             shader->ngg.esgs_vertex_stride);
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);

   radeon_begin(&sctx->gfx_cs);
   gfx11_begin_packed_context_regs();