ac/nir: return GSVS emit sizes from legacy GS lowering and simplify shader info

This simplifies shader info in drivers by returning GSVS emit sizes from
ac_nir_lower_legacy_gs. The pass knows the sizes, so drivers shouldn't
have to determine them independently.

This also makes the values more accurate because both drivers were
computing the GSVS emit sizes inaccurately and had redundant fields
in shader info. RADV had a lot of redundancy there.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35473>
This commit is contained in:
Marek Olšák 2025-06-01 16:39:35 -04:00 committed by Marge Bot
parent c1d3108855
commit 65972f2301
12 changed files with 62 additions and 59 deletions

View file

@ -276,9 +276,13 @@ typedef struct {
bool force_vrs;
} ac_nir_lower_legacy_gs_options;
typedef struct {
uint8_t num_components_per_stream[4];
} ac_nir_legacy_gs_info;
bool
ac_nir_lower_legacy_gs(nir_shader *nir, ac_nir_lower_legacy_gs_options *options,
nir_shader **gs_copy_shader);
nir_shader **gs_copy_shader, ac_nir_legacy_gs_info *out_info);
/* This is a pre-link pass. It should only eliminate code and do lowering that mostly doesn't
* generate AMD-specific intrinsics.

View file

@ -10,6 +10,7 @@
#include "nir_builder.h"
typedef struct {
ac_nir_legacy_gs_info *out_info;
ac_nir_prerast_out out;
nir_def *vertex_count[4];
@ -114,6 +115,9 @@ lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *in
memset(s->out.outputs_16bit_hi[slot], 0, sizeof(s->out.outputs_16bit_hi[slot]));
}
assert(offset / 4 < 256);
s->out_info->num_components_per_stream[stream] = offset / 4;
/* Signal vertex emission. */
nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
.base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));
@ -190,9 +194,9 @@ gather_output_stores(nir_shader *shader, lower_legacy_gs_state *s)
bool
ac_nir_lower_legacy_gs(nir_shader *nir, ac_nir_lower_legacy_gs_options *options,
nir_shader **gs_copy_shader)
nir_shader **gs_copy_shader, ac_nir_legacy_gs_info *out_info)
{
lower_legacy_gs_state s = {0};
lower_legacy_gs_state s = {out_info};
gather_output_stores(nir, &s);
ac_nir_compute_prerast_packed_output_info(&s.out);

View file

@ -449,7 +449,7 @@ load_gsvs_ring(nir_builder *b, lower_abi_state *s, unsigned stream_id)
unsigned stream_offset = 0;
unsigned stride = 0;
for (unsigned i = 0; i <= stream_id; i++) {
stride = 4 * s->info->gs.num_stream_output_components[i] * s->info->gs.vertices_out;
stride = 4 * (uint32_t)s->info->gs.num_components_per_stream[i] * s->info->gs.vertices_out;
if (i < stream_id)
stream_offset += stride * s->info->wave_size;
}

View file

@ -503,7 +503,12 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
.has_param_exports = stage->info.outinfo.param_exports,
.force_vrs = stage->info.force_vrs_per_vertex,
};
NIR_PASS(_, stage->nir, ac_nir_lower_legacy_gs, &options, &stage->gs_copy_shader);
ac_nir_legacy_gs_info info = {0};
NIR_PASS(_, stage->nir, ac_nir_lower_legacy_gs, &options, &stage->gs_copy_shader, &info);
for (unsigned i = 0; i < 4; i++)
stage->info.gs.num_components_per_stream[i] = info.num_components_per_stream[i];
}
} else if (stage->stage == MESA_SHADER_FRAGMENT) {
ac_nir_lower_ps_late_options late_options = {
@ -998,7 +1003,12 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
shader->info.outinfo.prim_param_exports;
} else {
/* GS -> FS outputs (GFX6-10.3 legacy) */
stats.outputs += shader->info.gs.gsvs_vertex_size / 16;
stats.outputs += DIV_ROUND_UP(((uint32_t)shader->info.gs.num_components_per_stream[0] +
(uint32_t)shader->info.gs.num_components_per_stream[1] +
(uint32_t)shader->info.gs.num_components_per_stream[2] +
(uint32_t)shader->info.gs.num_components_per_stream[3]) *
4,
16);
}
break;

View file

@ -1509,8 +1509,11 @@ radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_b
S_028A44_GS_INST_PRIMS_IN_SUBGRP(info->gs_ring_info.gs_inst_prims_in_subgroup);
const uint32_t gs_max_out_vertices = info->gs.vertices_out;
const uint8_t max_stream = info->gs.max_stream;
const uint8_t *num_components = info->gs.num_stream_output_components;
const uint8_t max_stream = info->gs.num_components_per_stream[3] ? 3
: info->gs.num_components_per_stream[2] ? 2
: info->gs.num_components_per_stream[1] ? 1
: 0;
const uint8_t *num_components = info->gs.num_components_per_stream;
uint32_t offset = num_components[0] * gs_max_out_vertices;
info->regs.gs.vgt_gsvs_ring_offset[0] = offset;

View file

@ -149,9 +149,6 @@ gather_intrinsic_store_output_info(const nir_shader *nir, const nir_intrinsic_in
case MESA_SHADER_TESS_EVAL:
output_usage_mask = info->tes.output_usage_mask;
break;
case MESA_SHADER_GEOMETRY:
output_usage_mask = info->gs.output_usage_mask;
break;
case MESA_SHADER_FRAGMENT:
if (location >= FRAG_RESULT_DATA0) {
const unsigned fs_semantic = location + io_sem.dual_source_blend_index;
@ -184,11 +181,6 @@ gather_intrinsic_store_output_info(const nir_shader *nir, const nir_intrinsic_in
}
}
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
const uint8_t gs_streams = nir_intrinsic_io_semantics(instr).gs_streams;
info->gs.output_streams[location] |= gs_streams << (component * 2);
}
if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && !io_sem.no_sysval_output) {
unsigned base = (location == VARYING_SLOT_CLIP_DIST1 ? 4 : 0) + component;
unsigned clip_array_mask = BITFIELD_MASK(nir->info.clip_distance_array_size);
@ -721,7 +713,13 @@ radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_inf
unsigned min_esgs_ring_size = align(esgs_vertex_stride * gs_vertex_reuse * wave_size, alignment);
/* These are recommended sizes, not minimum sizes. */
unsigned esgs_ring_size = max_gs_waves * 2 * wave_size * esgs_vertex_stride * gs_info->gs.vertices_in;
unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
unsigned gsvs_emit_size = 0;
for (unsigned stream = 0; stream < 4; stream++) {
gsvs_emit_size += (uint32_t)gs_info->gs.num_components_per_stream[stream] * 4 * gs_info->gs.vertices_out;
}
unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gsvs_emit_size;
min_esgs_ring_size = align(min_esgs_ring_size, alignment);
esgs_ring_size = align(esgs_ring_size, alignment);
@ -736,29 +734,11 @@ radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_inf
static void
gather_shader_info_gs(struct radv_device *device, const nir_shader *nir, struct radv_shader_info *info)
{
unsigned add_clip = nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4;
info->gs.gsvs_vertex_size = (util_bitcount64(nir->info.outputs_written) + add_clip) * 16;
info->gs.max_gsvs_emit_size = info->gs.gsvs_vertex_size * nir->info.gs.vertices_out;
info->gs.vertices_in = nir->info.gs.vertices_in;
info->gs.vertices_out = nir->info.gs.vertices_out;
info->gs.input_prim = nir->info.gs.input_primitive;
info->gs.output_prim = nir->info.gs.output_primitive;
info->gs.invocations = nir->info.gs.invocations;
info->gs.max_stream = nir->info.gs.active_stream_mask ? util_last_bit(nir->info.gs.active_stream_mask) - 1 : 0;
for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
const uint8_t usage_mask = info->gs.output_usage_mask[slot];
const uint8_t gs_streams = info->gs.output_streams[slot];
for (unsigned component = 0; component < 4; ++component) {
if (!(usage_mask & BITFIELD_BIT(component)))
continue;
const uint8_t stream = (gs_streams >> (component * 2)) & 0x3;
info->gs.num_stream_output_components[stream]++;
}
}
if (!info->inputs_linked)
info->gs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));

View file

@ -135,12 +135,7 @@ struct radv_shader_info {
uint32_t num_outputs; /* For NGG streamout only */
} vs;
struct {
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
uint8_t num_stream_output_components[4];
uint8_t output_streams[VARYING_SLOT_VAR31 + 1];
uint8_t max_stream;
unsigned gsvs_vertex_size;
unsigned max_gsvs_emit_size;
uint8_t num_components_per_stream[4];
unsigned vertices_in;
unsigned vertices_out;
unsigned input_prim;

View file

@ -127,7 +127,7 @@ static bool build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
*/
for (unsigned stream = 0; stream < 4; stream++) {
unsigned num_components = sel->info.num_gs_stream_components[stream];
unsigned num_components = s->shader->info.legacy_gs.num_components_per_stream[stream];
if (!num_components)
continue;

View file

@ -1612,7 +1612,8 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
.force_vrs = sel->screen->options.vrs2x2,
};
NIR_PASS(_, nir, ac_nir_lower_legacy_gs, &options, &ctx->gs_copy_shader);
NIR_PASS(_, nir, ac_nir_lower_legacy_gs, &options, &ctx->gs_copy_shader,
&shader->info.legacy_gs);
progress = true;
} else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
ac_nir_lower_ps_late_options late_options = {

View file

@ -171,10 +171,11 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
for (unsigned i = 0; i < 4; i++) {
unsigned stream = (gs_streams >> (i * 2)) & 0x3;
if (new_mask & (1 << i)) {
if (new_mask && stream == 0)
info->gs_writes_stream0 = true;
if (new_mask & (1 << i))
info->output_streams[loc] |= stream << (i * 2);
info->num_gs_stream_components[stream]++;
}
}
if (nir_intrinsic_has_src_type(intr))
@ -437,7 +438,6 @@ void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
info->base.gs.input_primitive = nir->info.gs.input_primitive;
info->base.gs.vertices_out = nir->info.gs.vertices_out;
info->base.gs.invocations = nir->info.gs.invocations;
info->base.gs.active_stream_mask = nir->info.gs.active_stream_mask;
break;
case MESA_SHADER_FRAGMENT:
@ -637,7 +637,6 @@ void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
}
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
info->max_gsvs_emit_size = info->num_outputs * 16 * nir->info.gs.vertices_out;
info->gs_input_verts_per_prim =
mesa_vertices_per_prim(nir->info.gs.input_primitive);
}

View file

@ -64,7 +64,6 @@ struct si_shader_info {
enum mesa_prim input_primitive;
uint16_t vertices_out;
uint8_t invocations;
uint8_t active_stream_mask:4;
} gs;
struct {
@ -98,7 +97,6 @@ struct si_shader_info {
uint8_t num_vs_inputs;
uint8_t num_vbos_in_user_sgprs;
uint8_t num_gs_stream_components[4];
uint16_t enabled_streamout_buffer_mask;
uint64_t inputs_read; /* "get_unique_index" bits */
@ -112,10 +110,10 @@ struct si_shader_info {
uint8_t clipdist_mask;
uint8_t culldist_mask;
bool gs_writes_stream0;
uint16_t esgs_vertex_stride;
uint8_t gs_input_verts_per_prim;
unsigned max_gsvs_emit_size;
/* Set 0xf or 0x0 (4 bits) per each written output.
* ANDed with spi_shader_col_format.
@ -230,6 +228,7 @@ struct si_shader_variant_info {
unsigned private_mem_vgprs;
unsigned max_simd_waves;
uint32_t ngg_lds_vertex_size; /* VS,TES: Cull+XFB, GS: GSVS size */
ac_nir_legacy_gs_info legacy_gs;
};
#endif

View file

@ -938,11 +938,13 @@ static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
{
struct si_shader_selector *sel = shader->selector;
const uint8_t *num_components = sel->info.num_gs_stream_components;
const uint8_t *num_components = shader->info.legacy_gs.num_components_per_stream;
unsigned gs_num_invocations = sel->info.base.gs.invocations;
struct si_pm4_state *pm4;
uint64_t va;
unsigned max_stream = util_last_bit(sel->info.base.gs.active_stream_mask);
unsigned max_stream = num_components[3] ? 4 :
num_components[2] ? 3 :
num_components[1] ? 2 : 1;
unsigned offset;
assert(sscreen->info.gfx_level < GFX11); /* gfx11 doesn't have the legacy pipeline */
@ -951,19 +953,19 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
if (!pm4)
return;
offset = num_components[0] * sel->info.base.gs.vertices_out;
offset = (uint32_t)num_components[0] * sel->info.base.gs.vertices_out;
shader->gs.vgt_gsvs_ring_offset_1 = offset;
if (max_stream >= 2)
offset += num_components[1] * sel->info.base.gs.vertices_out;
offset += (uint32_t)num_components[1] * sel->info.base.gs.vertices_out;
shader->gs.vgt_gsvs_ring_offset_2 = offset;
if (max_stream >= 3)
offset += num_components[2] * sel->info.base.gs.vertices_out;
offset += (uint32_t)num_components[2] * sel->info.base.gs.vertices_out;
shader->gs.vgt_gsvs_ring_offset_3 = offset;
if (max_stream >= 4)
offset += num_components[3] * sel->info.base.gs.vertices_out;
offset += (uint32_t)num_components[3] * sel->info.base.gs.vertices_out;
shader->gs.vgt_gsvs_ring_itemsize = offset;
/* The GSVS_RING_ITEMSIZE register takes 15 bits */
@ -3541,7 +3543,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
!sel->nir->info.writes_memory &&
/* NGG GS supports culling with streamout because it culls after streamout. */
(sel->stage == MESA_SHADER_GEOMETRY || !sel->info.enabled_streamout_buffer_mask) &&
(sel->stage != MESA_SHADER_GEOMETRY || sel->info.num_gs_stream_components[0]) &&
(sel->stage != MESA_SHADER_GEOMETRY || sel->info.gs_writes_stream0) &&
(sel->stage != MESA_SHADER_VERTEX ||
(!sel->nir->info.vs.blit_sgprs_amd &&
!sel->nir->info.vs.window_space_position));
@ -4120,10 +4122,16 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
/* Calculate the minimum size. */
unsigned min_esgs_ring_size = align(es->info.esgs_vertex_stride * gs_vertex_reuse * wave_size, alignment);
unsigned gsvs_emit_size = 0;
for (unsigned stream = 0; stream < 4; stream++) {
gsvs_emit_size += (uint32_t)sctx->shader.gs.current->info.legacy_gs.num_components_per_stream[stream] *
4 * gs->info.base.gs.vertices_out;
}
/* These are recommended sizes, not minimum sizes. */
unsigned esgs_ring_size =
max_gs_waves * 2 * wave_size * es->info.esgs_vertex_stride * gs->info.gs_input_verts_per_prim;
unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->info.max_gsvs_emit_size;
unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gsvs_emit_size;
min_esgs_ring_size = align(min_esgs_ring_size, alignment);
esgs_ring_size = align(esgs_ring_size, alignment);