From bad2530a40293cea5217e0cc0753dfa30eb05f20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 31 Dec 2023 12:27:55 -0500
Subject: [PATCH] radeonsi: pack GS_STATE_ESGS_VERTEX_STRIDE better to save 2 bits

Change it to the number of ES outputs, then compute the stride from that.

Acked-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 .../drivers/radeonsi/si_nir_lower_abi.c | 9 ++++---
 src/gallium/drivers/radeonsi/si_shader.h | 8 ++++--
 .../drivers/radeonsi/si_state_shaders.cpp | 27 +++++++++++++------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index c02204751b0..013315acb2f 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -360,9 +360,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
       break;
    case nir_intrinsic_load_esgs_vertex_stride_amd:
       assert(sel->screen->info.gfx_level >= GFX9);
-      replacement = shader->is_monolithic ?
-         nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4) :
-         GET_FIELD_NIR(GS_STATE_ESGS_VERTEX_STRIDE);
+      if (shader->is_monolithic) {
+         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
+      } else {
+         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
+         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
+      }
       break;
    case nir_intrinsic_load_tcs_num_patches_amd: {
       nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 73bdef6730b..2e058791827 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -253,8 +253,12 @@ enum
  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
  */
 /* bit gap */
-#define GS_STATE_ESGS_VERTEX_STRIDE__SHIFT 11
-#define GS_STATE_ESGS_VERTEX_STRIDE__MASK 0xff /* max 32 * 4 + 1 */
+/* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
+ * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
+ * Only used by GFX9+ to compute LDS addresses of GS inputs.
+ */
+#define GS_STATE_NUM_ES_OUTPUTS__SHIFT 13
+#define GS_STATE_NUM_ES_OUTPUTS__MASK 0x3f
 /* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
  * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
  * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index c7d723ed949..35138ae71b1 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -906,14 +906,25 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
    assert(out->max_prims_per_subgroup <= max_out_prims);
 }
 
+static void gfx9_set_gs_sgpr_num_es_outputs(struct si_context *sctx, unsigned esgs_vertex_stride)
+{
+   /* The stride must always be odd (e.g. a multiple of 4 + 1) to reduce LDS bank conflicts. */
+   assert(esgs_vertex_stride % 4 == 1);
+   unsigned num_es_outputs = (esgs_vertex_stride - 1) / 4;
+
+   /* If there are no ES outputs, GS doesn't use this SGPR field, so only set it if the number
+    * is non-zero.
+    */
+   if (num_es_outputs)
+      SET_FIELD(sctx->current_gs_state, GS_STATE_NUM_ES_OUTPUTS, num_es_outputs);
+}
+
 static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
 {
    struct si_shader *shader = sctx->queued.named.gs;
 
-   if (sctx->gfx_level >= GFX9) {
-      SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-                shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
-   }
+   if (sctx->gfx_level >= GFX9)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
 
    radeon_begin(&sctx->gfx_cs);
 
@@ -1167,8 +1178,8 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index)
 {
    struct si_shader *shader = sctx->queued.named.gs;
 
-   SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-             shader->ngg.esgs_vertex_stride);
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
 
    radeon_begin(&sctx->gfx_cs);
    if (HAS_TESS) {
@@ -1226,8 +1237,8 @@ static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
 {
    struct si_shader *shader = sctx->queued.named.gs;
 
-   SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
-             shader->ngg.esgs_vertex_stride);
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
 
    radeon_begin(&sctx->gfx_cs);
    gfx11_begin_packed_context_regs();