diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 57e18f88034..e4179575b8d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -787,6 +787,7 @@ struct si_streamout_target { struct si_streamout { enum mesa_prim output_prim; + uint8_t num_verts_per_prim; bool begin_emitted; unsigned enabled_mask; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 85db130d881..3e0575ac7f7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1594,13 +1594,15 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_VERTEX) && !key->ge.as_es && !key->ge.as_ls) { - fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); - fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize); - fprintf(f, " opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer); - fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); - fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); - fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout); fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout); + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); + fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); + fprintf(f, " opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize); + fprintf(f, " opt.kill_layer = %u\n", key->ge.opt.kill_layer); + fprintf(f, " opt.remove_streamout = %u\n", key->ge.opt.remove_streamout); + fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); + fprintf(f, " opt.ngg_vs_streamout_num_verts_per_prim = %u\n", + key->ge.opt.ngg_vs_streamout_num_verts_per_prim); } if (stage <= MESA_SHADER_GEOMETRY) diff --git 
a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 13144b606cb..bb5a5d5f2ff 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -780,6 +780,12 @@ struct si_shader_key_ge { /* For NGG VS and TES. */ unsigned ngg_culling : 11; /* SI_NGG_CULL_* */ + /* If NGG VS streamout knows the number of vertices per primitive at compile time, + * it can put stores for all vertices in the same VMEM clause, instead of storing + * the 2nd and 3rd vertices conditionally because the primitive type is + * unknown. + */ + unsigned ngg_vs_streamout_num_verts_per_prim : 2; /* For shaders where monolithic variants have better code. * diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 46b7b5dbee3..78798a47f24 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1369,6 +1369,15 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_s if (key->ge.opt.ngg_culling & SI_NGG_CULL_VS_LINES) return MESA_PRIM_LINES; + switch (key->ge.opt.ngg_vs_streamout_num_verts_per_prim) { + case 3: + return MESA_PRIM_TRIANGLES; + case 2: + return MESA_PRIM_LINES; + case 1: + return MESA_PRIM_POINTS; + } + if (return_unknown) return MESA_PRIM_UNKNOWN; else @@ -2525,8 +2534,21 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele key->ge.opt.ngg_culling = sctx->ngg_culling; key->ge.mono.u.vs_export_prim_id = vs->stage != MESA_SHADER_GEOMETRY && sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; - key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask && - !sctx->streamout.enabled_mask; + + if (vs->info.enabled_streamout_buffer_mask) { + if (sctx->streamout.enabled_mask) { + key->ge.opt.remove_streamout = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = + sctx->gfx_level >= GFX11 ? 
sctx->streamout.num_verts_per_prim : 0; + } else { + key->ge.opt.remove_streamout = 1; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; + } + } else { + key->ge.opt.remove_streamout = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; + } + if (sctx->gfx_level >= GFX12) key->ge.mono.remove_streamout = key->ge.opt.remove_streamout; } @@ -2538,6 +2560,7 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se key->ge.opt.kill_outputs = 0; key->ge.opt.remove_streamout = 0; key->ge.opt.ngg_culling = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; key->ge.mono.u.vs_export_prim_id = 0; key->ge.mono.remove_streamout = 0; } diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index f0d6144b90c..b1ed4f11f7c 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -218,6 +218,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */ sctx->streamout.output_prim = output_prim; + sctx->streamout.num_verts_per_prim = output_prim == MESA_PRIM_UNKNOWN ? + 0 : mesa_vertices_per_prim(output_prim); sctx->streamout.num_targets = num_targets; sctx->streamout.enabled_mask = enabled_mask; sctx->streamout.append_bitmask = append_bitmask;