radeonsi: pack GS_STATE_ESGS_VERTEX_STRIDE better to save 2 bits

Change it to the number of ES outputs, then compute the stride from that.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26917>
This commit is contained in:
Marek Olšák 2023-12-31 12:27:55 -05:00 committed by Marge Bot
parent 8eed352e05
commit bad2530a40
3 changed files with 31 additions and 13 deletions

View file

@ -360,9 +360,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
break;
case nir_intrinsic_load_esgs_vertex_stride_amd:
assert(sel->screen->info.gfx_level >= GFX9);
replacement = shader->is_monolithic ?
nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4) :
GET_FIELD_NIR(GS_STATE_ESGS_VERTEX_STRIDE);
if (shader->is_monolithic) {
replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
} else {
nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
}
break;
case nir_intrinsic_load_tcs_num_patches_amd: {
nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);

View file

@ -253,8 +253,12 @@ enum
* in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
*/
/* bit gap */
#define GS_STATE_ESGS_VERTEX_STRIDE__SHIFT 11
#define GS_STATE_ESGS_VERTEX_STRIDE__MASK 0xff /* max 32 * 4 + 1 */
/* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
* can be 55 at most, so it fits in 6 bits. The ESGS vertex stride in dwords is computed from it
* as: NUM_ES_OUTPUTS * 4 + 1.
* Only used by GFX9+ to compute LDS addresses of GS inputs.
*/
#define GS_STATE_NUM_ES_OUTPUTS__SHIFT 13
#define GS_STATE_NUM_ES_OUTPUTS__MASK 0x3f
/* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
* where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
* Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;

View file

@ -906,14 +906,25 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
assert(out->max_prims_per_subgroup <= max_out_prims);
}
/* Store the number of ES outputs in the GS_STATE_NUM_ES_OUTPUTS field of
 * sctx->current_gs_state.
 *
 * esgs_vertex_stride is expected to have the form NUM_ES_OUTPUTS * 4 + 1; the
 * output count is recovered from it by undoing that formula.
 */
static void gfx9_set_gs_sgpr_num_es_outputs(struct si_context *sctx, unsigned esgs_vertex_stride)
{
   /* The stride is always odd (i.e. a multiple of 4, plus 1) to reduce LDS bank conflicts. */
   assert(esgs_vertex_stride % 4 == 1);

   const unsigned es_output_count = (esgs_vertex_stride - 1) / 4;

   /* With zero ES outputs, GS doesn't use this SGPR field, so leave it untouched. */
   if (es_output_count == 0)
      return;

   SET_FIELD(sctx->current_gs_state, GS_STATE_NUM_ES_OUTPUTS, es_output_count);
}
static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
{
struct si_shader *shader = sctx->queued.named.gs;
if (sctx->gfx_level >= GFX9) {
SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
}
if (sctx->gfx_level >= GFX9)
gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
radeon_begin(&sctx->gfx_cs);
@ -1167,8 +1178,8 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index)
{
struct si_shader *shader = sctx->queued.named.gs;
SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
shader->ngg.esgs_vertex_stride);
if (shader->selector->stage == MESA_SHADER_GEOMETRY)
gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
radeon_begin(&sctx->gfx_cs);
if (HAS_TESS) {
@ -1226,8 +1237,8 @@ static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
{
struct si_shader *shader = sctx->queued.named.gs;
SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
shader->ngg.esgs_vertex_stride);
if (shader->selector->stage == MESA_SHADER_GEOMETRY)
gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
radeon_begin(&sctx->gfx_cs);
gfx11_begin_packed_context_regs();