diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 323329e737a..a20b9847553 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -246,7 +246,10 @@ static bool lower_abi_instr(nir_builder *b, nir_instr *instr, struct lower_abi_s unreachable("no nir_load_lshs_vertex_stride_amd"); break; case nir_intrinsic_load_esgs_vertex_stride_amd: - replacement = nir_imm_int(b, 1); + assert(sel->screen->info.gfx_level >= GFX9); + replacement = shader->is_monolithic ? + nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4) : + GET_FIELD_NIR(GS_STATE_ESGS_VERTEX_STRIDE); break; case nir_intrinsic_load_tcs_num_patches_amd: { nir_ssa_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 793d405b9eb..c4f7cda73a5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -266,6 +266,8 @@ enum * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader. */ /* bit gap */ +#define GS_STATE_ESGS_VERTEX_STRIDE__SHIFT 10 +#define GS_STATE_ESGS_VERTEX_STRIDE__MASK 0xff /* max 32 * 4 + 1 */ /* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent. 
* Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf; @@ -917,7 +919,7 @@ struct si_shader { unsigned vgt_primitiveid_en; unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_instance_cnt; - unsigned vgt_esgs_ring_itemsize; + unsigned esgs_vertex_stride; unsigned spi_vs_out_config; unsigned spi_shader_idx_format; unsigned spi_shader_pos_format; diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index d120256e009..bf073c363e6 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -791,8 +791,8 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, */ if (sscreen->info.gfx_level >= GFX9) info->esgs_vertex_stride += 4; - - assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0); + else + assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0); info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read & ~info->base.inputs_read_indirectly & diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 65bbbb4aca5..c0213111415 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -65,8 +65,8 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) ret = si_insert_input_ptr(ctx, ret, ctx->args->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS); ret = si_insert_input_ptr(ctx, ret, ctx->args->bindless_samplers_and_images, 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); if (ctx->screen->use_ngg) { - ret = si_insert_input_ptr(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); ret = si_insert_input_ptr(ctx, ret, ctx->args->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO); if (ctx->screen->info.gfx_level >= GFX11) ret = 
si_insert_input_ptr(ctx, ret, ctx->args->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index a4b3b155790..7560a07c74b 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5804,6 +5804,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1); si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); if (sctx->gfx_level < GFX11) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index a8ee703b6a9..e1a4040b8d9 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1214,10 +1214,11 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) if (HAS_GS) { radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state); - /* NGG always uses the state bits. Legacy GS uses the state bits only for the emulation - * of GS pipeline statistics on gfx10.x. + /* GS always uses the state bits for emulating VGT_ESGS_RING_ITEMSIZE on gfx9+ + * (via nir_load_esgs_vertex_stride_amd) and for emulating GS pipeline statistics + * on gfx10.x. NGG GS also has lots of states in there. */ - if (NGG || (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3)) + if (GFX_VERSION >= GFX9) radeon_set_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state); /* The GS copy shader (for legacy GS) always uses the state bits. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 3c50148a9c0..3ddb9ffdfc7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -934,6 +934,11 @@ static void si_emit_shader_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs; + if (sctx->gfx_level >= GFX9) { + SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE, + shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4); + } + radeon_begin(&sctx->gfx_cs); /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 @@ -971,10 +976,6 @@ static void si_emit_shader_gs(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, shader->gs.vgt_gs_max_prims_per_subgroup); - /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */ - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->gs.vgt_esgs_ring_itemsize); if (shader->key.ge.part.gs.es->stage == MESA_SHADER_TESS_EVAL) radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, @@ -1175,6 +1176,9 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader) /* Common tail code for NGG primitive shaders. 
*/ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader) { + SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE, + shader->ngg.esgs_vertex_stride); + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, @@ -1189,9 +1193,6 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader } radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, shader->ngg.vgt_gs_instance_cnt); - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->ngg.vgt_esgs_ring_itemsize); radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, shader->ngg.spi_vs_out_config); radeon_opt_set_context_reg2( @@ -1441,10 +1442,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_sel->info.writes_primid); if (gs_stage == MESA_SHADER_GEOMETRY) { - shader->ngg.vgt_esgs_ring_itemsize = es_sel->info.esgs_vertex_stride / 4; + shader->ngg.esgs_vertex_stride = es_sel->info.esgs_vertex_stride / 4; shader->ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out; } else { - shader->ngg.vgt_esgs_ring_itemsize = 1; + shader->ngg.esgs_vertex_stride = 1; } if (es_stage == MESA_SHADER_TESS_EVAL)