radv: recalculate legacy_gs_info on bind

Previously legacy_gs_info was calculated based on
gs_info->legacy_gs_info.esgs_itemsize, which is calculated based on gs
input varyings.

However, when using ESO (VK_EXT_shader_object), vs/tes can have outputs
not read by gs, which leads to underestimating LDS usage.

Cc: mesa-stable
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38514>
(cherry picked from commit 5e8885a339)
This commit is contained in:
Anna Maniscalco 2025-11-18 23:40:00 +01:00 committed by Eric Engestrom
parent 0f909636cc
commit bc639539af
7 changed files with 45 additions and 35 deletions

View file

@ -4414,7 +4414,7 @@
"description": "radv: recalculate legacy_gs_info on bind",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -11536,18 +11536,27 @@ radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
radv_bind_gs_copy_shader(cmd_buffer, gs_copy_shader);
/* Determine NGG GS info. */
/* Determine GS info. */
if (cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY] &&
cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.is_ngg &&
cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.merged_shader_compiled_separately) {
struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
: cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
gfx10_ngg_set_esgs_ring_itemsize(device, &es->info, &gs->info, &gs->info.ngg_info);
gfx10_get_ngg_info(device, &es->info, &gs->info, &gs->info.ngg_info);
radv_precompute_registers_hw_ngg(device, &gs->config, &gs->info);
if (cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
gfx10_ngg_set_esgs_ring_itemsize(device, &es->info, &gs->info, &gs->info.ngg_info);
gfx10_get_ngg_info(device, &es->info, &gs->info, &gs->info.ngg_info);
radv_precompute_registers_hw_ngg(device, &gs->config, &gs->info);
} else {
radv_get_legacy_gs_info(device, &es->info, &gs->info);
radv_precompute_registers_hw_gs(device, &es->info, &gs->info);
cmd_buffer->esgs_ring_size_needed =
MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.legacy_gs_info.esgs_ring_size);
cmd_buffer->gsvs_ring_size_needed =
MAX2(cmd_buffer->gsvs_ring_size_needed, gs->info.legacy_gs_info.gsvs_ring_size);
}
}
/* Determine the rasterized primitive. */

View file

@ -2803,7 +2803,7 @@ radv_graphics_shaders_compile(struct radv_device *device, struct vk_pipeline_cac
}
if (stages[MESA_SHADER_GEOMETRY].nir && !stages[MESA_SHADER_GEOMETRY].info.is_ngg)
radv_get_legacy_gs_info(device, &stages[MESA_SHADER_GEOMETRY].info);
radv_get_legacy_gs_info(device, NULL, &stages[MESA_SHADER_GEOMETRY].info);
/* Compile NIR shaders to AMD assembly. */
radv_graphics_shaders_nir_to_asm(device, cache, stages, gfx_state, keep_executable_info, keep_statistic_info,

View file

@ -1494,60 +1494,59 @@ radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader_b
}
}
static void
radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_binary *binary)
void
radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_shader_info *info = &binary->info;
info->regs.gs.vgt_esgs_ring_itemsize = info->legacy_gs_info.esgs_itemsize;
gs_info->regs.gs.vgt_esgs_ring_itemsize = es_info ? es_info->esgs_itemsize / 4 : gs_info->legacy_gs_info.esgs_itemsize;
info->regs.gs.vgt_gs_max_prims_per_subgroup =
S_028A94_MAX_PRIMS_PER_SUBGROUP(info->legacy_gs_info.gs_inst_prims_in_subgroup);
gs_info->regs.gs.vgt_gs_max_prims_per_subgroup =
S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info->legacy_gs_info.gs_inst_prims_in_subgroup);
info->regs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(info->legacy_gs_info.es_verts_per_subgroup) |
S_028A44_GS_PRIMS_PER_SUBGRP(info->legacy_gs_info.gs_prims_per_subgroup) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(info->legacy_gs_info.gs_inst_prims_in_subgroup);
gs_info->regs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(gs_info->legacy_gs_info.es_verts_per_subgroup) |
S_028A44_GS_PRIMS_PER_SUBGRP(gs_info->legacy_gs_info.gs_prims_per_subgroup) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info->legacy_gs_info.gs_inst_prims_in_subgroup);
const uint32_t gs_max_out_vertices = info->gs.vertices_out;
const uint8_t max_stream = info->gs.num_components_per_stream[3] ? 3
: info->gs.num_components_per_stream[2] ? 2
: info->gs.num_components_per_stream[1] ? 1
const uint32_t gs_max_out_vertices = gs_info->gs.vertices_out;
const uint8_t max_stream = gs_info->gs.num_components_per_stream[3] ? 3
: gs_info->gs.num_components_per_stream[2] ? 2
: gs_info->gs.num_components_per_stream[1] ? 1
: 0;
const uint8_t *num_components = info->gs.num_components_per_stream;
const uint8_t *num_components = gs_info->gs.num_components_per_stream;
uint32_t offset = num_components[0] * gs_max_out_vertices;
info->regs.gs.vgt_gsvs_ring_offset[0] = offset;
gs_info->regs.gs.vgt_gsvs_ring_offset[0] = offset;
if (max_stream >= 1)
offset += num_components[1] * gs_max_out_vertices;
info->regs.gs.vgt_gsvs_ring_offset[1] = offset;
gs_info->regs.gs.vgt_gsvs_ring_offset[1] = offset;
if (max_stream >= 2)
offset += num_components[2] * gs_max_out_vertices;
info->regs.gs.vgt_gsvs_ring_offset[2] = offset;
gs_info->regs.gs.vgt_gsvs_ring_offset[2] = offset;
if (max_stream >= 3)
offset += num_components[3] * gs_max_out_vertices;
info->regs.gs.vgt_gsvs_ring_itemsize = offset;
gs_info->regs.gs.vgt_gsvs_ring_itemsize = offset;
for (uint32_t i = 0; i < 4; i++)
info->regs.gs.vgt_gs_vert_itemsize[i] = (max_stream >= i) ? num_components[i] : 0;
gs_info->regs.gs.vgt_gs_vert_itemsize[i] = (max_stream >= i) ? num_components[i] : 0;
const uint32_t gs_num_invocations = info->gs.invocations;
info->regs.gs.vgt_gs_instance_cnt =
const uint32_t gs_num_invocations = gs_info->gs.invocations;
gs_info->regs.gs.vgt_gs_instance_cnt =
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
info->regs.spi_shader_pgm_rsrc3_gs =
gs_info->regs.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F), C_00B21C_CU_EN, 0, &pdev->info);
if (pdev->info.gfx_level >= GFX10) {
info->regs.spi_shader_pgm_rsrc4_gs =
gs_info->regs.spi_shader_pgm_rsrc4_gs =
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), C_00B204_CU_EN_GFX10,
16, &pdev->info);
}
info->regs.vgt_gs_max_vert_out = info->gs.vertices_out;
gs_info->regs.vgt_gs_max_vert_out = gs_info->gs.vertices_out;
}
void
@ -1897,7 +1896,7 @@ radv_precompute_registers(struct radv_device *device, struct radv_shader_binary
if (info->is_ngg) {
radv_precompute_registers_hw_ngg(device, &binary->config, &binary->info);
} else {
radv_precompute_registers_hw_gs(device, binary);
radv_precompute_registers_hw_gs(device, NULL, &binary->info);
}
break;
case MESA_SHADER_MESH:

View file

@ -736,6 +736,8 @@ uint32_t radv_get_user_sgpr_loc(const struct radv_shader *shader, int idx);
uint32_t radv_get_user_sgpr(const struct radv_shader *shader, int idx);
void radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info);
void radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_shader_config *config,
struct radv_shader_info *info);

View file

@ -677,11 +677,11 @@ gather_shader_info_tes(struct radv_device *device, const nir_shader *nir, struct
}
void
radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_info *gs_info)
radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_legacy_gs_info *out = &gs_info->legacy_gs_info;
const unsigned esgs_vertex_stride = out->esgs_itemsize * 4;
const unsigned esgs_vertex_stride = es_info ? es_info->esgs_itemsize : out->esgs_itemsize * 4;
ac_legacy_gs_subgroup_info info;
ac_legacy_gs_compute_subgroup_info(gs_info->gs.input_prim, gs_info->gs.vertices_out, gs_info->gs.invocations,

View file

@ -329,7 +329,7 @@ void radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shad
const enum radv_pipeline_type pipeline_type, bool consider_force_vrs,
struct radv_shader_info *info);
void radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_info *gs_info);
void radv_get_legacy_gs_info(const struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info);
void gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es_info,
struct radv_shader_info *gs_info, struct gfx10_ngg_info *out);