radv: switch to the new TCS LDS/offchip size computation

This makes RADV use the same logic as radeonsi. It could be improved further; see the TODOs.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
Marek Olšák 2024-10-03 12:05:24 -04:00 committed by Marge Bot
parent 823e9e846e
commit 8c2f9f0665
7 changed files with 44 additions and 87 deletions

View file

@ -1209,20 +1209,6 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
return num_patches; return num_patches;
} }
uint32_t
ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches)
{
unsigned lds_size = lds_per_patch * num_patches;
/* The first vec4 is reserved for the tf0/1 shader message group vote. */
if (info->gfx_level >= GFX11)
lds_size += AC_HS_MSG_VOTE_LDS_BYTES;
assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
return align(lds_size, info->lds_encode_granularity) / info->lds_encode_granularity;
}
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info) const struct radeon_info *info)
{ {

View file

@ -306,9 +306,6 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
uint32_t lds_per_patch, uint32_t wave_size, uint32_t lds_per_patch, uint32_t wave_size,
bool tess_uses_primid); bool tess_uses_primid);
uint32_t ac_compute_tess_lds_size(const struct radeon_info *info,
uint32_t lds_per_patch, uint32_t num_patches);
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info); const struct radeon_info *info);

View file

@ -3495,17 +3495,21 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
* is dynamic. * is dynamic.
*/ */
if (cmd_buffer->state.uses_dynamic_patch_control_points) { if (cmd_buffer->state.uses_dynamic_patch_control_points) {
/* Compute the number of patches. */ struct shader_info tcs_info;
cmd_buffer->state.tess_num_patches = radv_get_tcs_num_patches(
pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs);
/* Compute the LDS size. */ /* No other shader_info fields are needed. */
cmd_buffer->state.tess_lds_size = tcs_info.tess.tcs_vertices_out = tcs->info.tcs.tcs_vertices_out;
radv_get_tess_lds_size(pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, /* These are only used to determine the LDS layout for TCS outputs. */
vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches, tcs_info.outputs_read = tcs->info.tcs.tcs_outputs_read;
tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs); tcs_info.outputs_written = tcs->info.tcs.tcs_outputs_written;
tcs_info.patch_outputs_read = tcs->info.tcs.tcs_patch_outputs_read;
tcs_info.patch_outputs_written = tcs->info.tcs.tcs_patch_outputs_written;
radv_get_tess_wg_info(pdev, &tcs_info, d->vk.ts.patch_control_points,
/* TODO: This should be only inputs in LDS (not VGPR inputs) to reduce LDS usage */
vs->info.vs.num_linked_outputs, tcs->info.tcs.num_linked_outputs,
tcs->info.tcs.num_linked_patch_outputs, tcs->info.tcs.all_invocations_define_tess_levels,
&cmd_buffer->state.tess_num_patches, &cmd_buffer->state.tess_lds_size);
} }
ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) | ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |

View file

@ -3567,43 +3567,17 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
return offset ? ((offset - SI_SH_REG_OFFSET) >> 2) : 0; return offset ? ((offset - SI_SH_REG_OFFSET) >> 2) : 0;
} }
static uint32_t void
radv_get_tess_patch_size(uint32_t tcs_num_input_vertices, uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, radv_get_tess_wg_info(const struct radv_physical_device *pdev, const struct shader_info *tcs_info,
uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs) unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs, unsigned tcs_num_vram_outputs,
unsigned tcs_num_vram_patch_outputs, bool all_invocations_define_tess_levels,
unsigned *num_patches_per_wg, unsigned *hw_lds_size)
{ {
const uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);
const uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
const uint32_t lds_output_vertex_size = tcs_num_lds_outputs * 16;
const uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
const uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;
return input_patch_size + lds_output_patch_size; ac_nir_compute_tess_wg_info(&pdev->info, tcs_info, pdev->ge_wave_size, false, all_invocations_define_tess_levels,
} tcs_num_input_vertices, lds_input_vertex_size, tcs_num_vram_outputs,
tcs_num_vram_patch_outputs, num_patches_per_wg, hw_lds_size);
uint32_t
radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices,
unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_lds_outputs,
unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
unsigned tcs_num_vram_patch_outputs)
{
const uint32_t lds_per_patch = radv_get_tess_patch_size(
tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs);
const uint32_t vram_per_patch = radv_get_tess_patch_size(tcs_num_input_vertices, tcs_num_output_vertices, 0,
tcs_num_vram_outputs, tcs_num_vram_patch_outputs);
return ac_compute_num_tess_patches(&pdev->info, tcs_num_input_vertices, tcs_num_output_vertices, vram_per_patch,
lds_per_patch, pdev->ge_wave_size, false);
}
uint32_t
radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices,
uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches,
uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs)
{
const uint32_t lds_per_patch = radv_get_tess_patch_size(
tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs);
return ac_compute_tess_lds_size(&pdev->info, lds_per_patch, tcs_num_patches);
} }
VkResult VkResult

View file

@ -664,14 +664,10 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
return stride; return stride;
} }
uint32_t radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices, void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const struct shader_info *tcs_info,
unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs, unsigned tcs_num_vram_outputs,
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_patch_outputs, bool all_invocations_define_tess_levels,
unsigned tcs_num_vram_outputs, unsigned tcs_num_vram_patch_outputs); unsigned *num_patches_per_wg, unsigned *hw_lds_size);
uint32_t radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices,
uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches,
uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs);
void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
const struct radv_graphics_state_key *gfx_state); const struct radv_graphics_state_key *gfx_state);

View file

@ -633,17 +633,18 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
const struct radv_graphics_state_key *gfx_state, struct radv_shader_info *info) const struct radv_graphics_state_key *gfx_state, struct radv_shader_info *info)
{ {
const struct radv_physical_device *pdev = radv_device_physical(device); const struct radv_physical_device *pdev = radv_device_physical(device);
nir_tcs_info tcs_info;
const uint64_t tess_lvl_mask = VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER; nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode, nir->info.tess.spacing);
const uint64_t per_vtx_out_mask = nir->info.outputs_read & nir->info.outputs_written & ~tess_lvl_mask;
const uint64_t tess_lvl_out_mask = nir->info.outputs_written & tess_lvl_mask;
const uint32_t per_patch_out_mask = nir->info.patch_outputs_read & nir->info.patch_outputs_written;
info->tcs.num_lds_per_vertex_outputs = util_bitcount64(per_vtx_out_mask); info->tcs.tcs_outputs_read = nir->info.outputs_read;
info->tcs.num_lds_per_patch_outputs = util_bitcount64(tess_lvl_out_mask) + util_bitcount(per_patch_out_mask); info->tcs.tcs_outputs_written = nir->info.outputs_written;
info->tcs.tcs_patch_outputs_read = nir->info.patch_inputs_read;
info->tcs.tcs_patch_outputs_written = nir->info.patch_outputs_written;
info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out; info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
info->tcs.tes_inputs_read = ~0ULL; info->tcs.tes_inputs_read = ~0ULL;
info->tcs.tes_patch_inputs_read = ~0ULL; info->tcs.tes_patch_inputs_read = ~0ULL;
info->tcs.all_invocations_define_tess_levels = tcs_info.all_invocations_define_tess_levels;
if (!info->inputs_linked) if (!info->inputs_linked)
info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read)); info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));
@ -655,16 +656,12 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
} }
if (gfx_state->ts.patch_control_points) { if (gfx_state->ts.patch_control_points) {
/* Number of tessellation patches per workgroup processed by the current pipeline. */
info->num_tess_patches = radv_get_tcs_num_patches(
pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
info->tcs.num_linked_patch_outputs);
/* LDS size used by VS+TCS for storing TCS inputs and outputs. */ radv_get_tess_wg_info(pdev, &nir->info, gfx_state->ts.patch_control_points,
info->tcs.num_lds_blocks = radv_get_tess_lds_size( /* TODO: This should be only inputs in LDS (not VGPR inputs) to reduce LDS usage */
pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->tcs.num_linked_inputs, info->tcs.num_linked_outputs,
info->num_tess_patches, info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs); info->tcs.num_linked_patch_outputs, tcs_info.all_invocations_define_tess_levels,
&info->num_tess_patches, &info->tcs.num_lds_blocks);
} }
} }

View file

@ -235,14 +235,17 @@ struct radv_shader_info {
struct { struct {
uint64_t tes_inputs_read; uint64_t tes_inputs_read;
uint64_t tes_patch_inputs_read; uint64_t tes_patch_inputs_read;
uint64_t tcs_outputs_read;
uint64_t tcs_outputs_written;
uint32_t tcs_patch_outputs_read;
uint32_t tcs_patch_outputs_written;
unsigned tcs_vertices_out; unsigned tcs_vertices_out;
uint32_t num_lds_blocks; uint32_t num_lds_blocks;
uint8_t num_linked_inputs; /* Number of reserved per-vertex input slots in LDS. */ uint8_t num_linked_inputs; /* Number of reserved per-vertex input slots in LDS. */
uint8_t num_linked_outputs; /* Number of reserved per-vertex output slots in VRAM. */ uint8_t num_linked_outputs; /* Number of reserved per-vertex output slots in VRAM. */
uint8_t num_linked_patch_outputs; /* Number of reserved per-patch output slots in VRAM. */ uint8_t num_linked_patch_outputs; /* Number of reserved per-patch output slots in VRAM. */
uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */
uint8_t num_lds_per_patch_outputs; /* Number of reserved per-patch output slots in LDS. */
bool tes_reads_tess_factors : 1; bool tes_reads_tess_factors : 1;
bool all_invocations_define_tess_levels : 1;
} tcs; } tcs;
struct { struct {
enum mesa_prim output_prim; enum mesa_prim output_prim;