radeonsi: support mesh shader per-primitive outputs

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37932>
This commit is contained in:
Qiang Yu 2025-05-19 14:13:16 +08:00 committed by Marge Bot
parent 16656ebaaf
commit dfc679e791
5 changed files with 40 additions and 8 deletions

View file

@ -54,7 +54,9 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
b->shader->info.vs.blit_sgprs_amd - 1) :
ac_nir_load_arg(b, &args->ac, args->gs_attr_address);
unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
unsigned per_vertex_params = MAX2(1, si_shader_num_alloc_param_exports(shader));
unsigned total_params = per_vertex_params + shader->info.nr_prim_param_exports;
unsigned stride = 16 * total_params;
uint32_t desc[4];
ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,

View file

@ -1224,6 +1224,7 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade
uint64_t outputs_written = 0;
uint32_t outputs_written_16bit = 0;
uint64_t per_primitive_outputs = 0;
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
assert(impl);
@ -1235,7 +1236,8 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_output &&
intr->intrinsic != nir_intrinsic_store_per_vertex_output)
intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
intr->intrinsic != nir_intrinsic_store_per_primitive_output)
continue;
/* No indirect indexing allowed. */
@ -1250,6 +1252,9 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade
else
outputs_written |= BITFIELD64_BIT(sem.location);
if (intr->intrinsic == nir_intrinsic_store_per_primitive_output)
per_primitive_outputs |= BITFIELD64_BIT(sem.location);
/* Assign the param index if it's unassigned. */
if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
(sem.gs_streams & 0x3) == 0 &&
@ -1259,7 +1264,10 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade
/* It must not be remapped (duplicated). */
assert(slot_remap[sem.location] == -1);
temp_info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
temp_info->vs_output_param_offset[sem.location] =
intr->intrinsic == nir_intrinsic_store_per_primitive_output ?
info->nr_prim_param_exports++ :
info->nr_param_exports++;
}
}
}
@ -1274,9 +1282,19 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade
temp_info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
}
/* Per-primitive outputs come after per-vertex outputs. */
unsigned per_primitive_outputs_offset = info->nr_param_exports;
if (sel->screen->info.gfx_level >= GFX11)
per_primitive_outputs_offset = MAX2(per_primitive_outputs_offset, 1);
u_foreach_bit64 (i, per_primitive_outputs) {
if (temp_info->vs_output_param_offset[i] != AC_EXP_PARAM_DEFAULT_VAL_0000)
temp_info->vs_output_param_offset[i] += per_primitive_outputs_offset;
}
/* Update the outputs-written info, since some outputs may have been removed earlier. */
nir->info.outputs_written = outputs_written;
nir->info.outputs_written_16bit = outputs_written_16bit;
nir->info.per_primitive_outputs = per_primitive_outputs;
}
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
@ -1284,6 +1302,7 @@ static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
{
/* Initialize this first. */
shader->info.nr_param_exports = 0;
shader->info.nr_prim_param_exports = 0;
STATIC_ASSERT(sizeof(temp_info->vs_output_param_offset[0]) == 1);
memset(temp_info->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
@ -1296,7 +1315,7 @@ static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
/* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
/* TODO: This doesn't affect GS. */
/* TODO: This doesn't affect GS and MS. */
NIR_PASS(_, nir, ac_nir_optimize_outputs, false, slot_remap,
temp_info->vs_output_param_offset);
@ -2039,6 +2058,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
S_028644_DEFAULT_VAL(offset);
}
if (sscreen->info.gfx_level >= GFX11 &&
(nir->info.per_primitive_outputs & BITFIELD64_BIT(semantic)))
ps_input_cntl |= S_028644_PRIM_ATTR(1);
shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
}
}

View file

@ -341,6 +341,7 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output:
case nir_intrinsic_store_per_primitive_output:
scan_io_usage(nir, info, intr, false, colors_lowered);
break;
case nir_intrinsic_load_deref:

View file

@ -224,6 +224,7 @@ struct si_shader_variant_info {
bool uses_mesh_scratch_ring : 1;
uint8_t nr_pos_exports;
uint8_t nr_param_exports;
uint8_t nr_prim_param_exports;
uint8_t clipdist_mask;
uint8_t culldist_mask;
uint8_t num_streamout_vec4s;

View file

@ -1600,8 +1600,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id ||
gs_sel->info.writes_primid);
unsigned num_params = si_shader_num_alloc_param_exports(shader);
unsigned num_prim_params = shader->info.nr_prim_param_exports;
bool no_pc_export = num_params == 0 && num_prim_params == 0;
if (sscreen->info.gfx_level >= GFX12) {
unsigned num_params = si_shader_num_alloc_param_exports(shader);
unsigned wave_limit_per_se = 0x3ff;
/* This tuning adds up to 50% streamout performance. */
@ -1632,7 +1635,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_00B220_WAVE_LIMIT(wave_limit_per_se) |
S_00B220_INST_PREF_SIZE(si_get_shader_prefetch_size(shader));
shader->ngg.spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) |
S_00B0C4_NO_PC_EXPORT(num_params == 0);
S_00B0C4_PRIM_EXPORT_COUNT(num_prim_params) |
S_00B0C4_NO_PC_EXPORT(no_pc_export);
} else {
unsigned late_alloc_wave64, cu_mask;
@ -1662,8 +1666,9 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
C_00B21C_CU_EN, 0, &sscreen->info);
shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
shader->ngg.spi_vs_out_config =
S_0286C4_VS_EXPORT_COUNT(MAX2(shader->info.nr_param_exports, 1) - 1) |
S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
S_0286C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) |
S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) |
S_0286C4_NO_PC_EXPORT(no_pc_export);
if (sscreen->info.gfx_level >= GFX11) {
shader->ngg.spi_shader_pgm_rsrc4_gs |=