ac/nir/tess: adjust memory layout of TCS outputs to have aligned store offsets

There is a comment that explains it.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34780>
This commit is contained in:
Marek Olšák 2025-04-19 10:03:37 -04:00 committed by Marge Bot
parent 80236f2367
commit 534b282573
6 changed files with 100 additions and 55 deletions

View file

@ -926,10 +926,27 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
return CLAMP(workgroup_size, 1, 256);
}
/* Return the off-chip (VRAM) memory footprint in bytes of all TCS outputs for
 * one threadgroup, given the number of output control points per patch, the
 * number of per-vertex and per-patch outputs stored to memory, and the number
 * of patches in the threadgroup.
 *
 * Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
 * the 16 vec4 elements are written by 16 consecutive lanes.
 *
 * 256B is the granularity of interleaving memory channels, which means a single output store
 * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
 * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
 */
static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
                                           uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
{
   /* Stride of one per-vertex output over the whole workgroup: 16B (one vec4)
    * per vertex, for every vertex of every patch, rounded up to 256B.
    */
   const unsigned pervertex_stride = align(16 * num_tcs_output_cp * num_patches, 256);
   /* Stride of one per-patch output: 16B per patch, rounded up to 256B. */
   const unsigned perpatch_stride = align(16 * num_patches, 256);

   return num_mem_tcs_outputs * pervertex_stride +
          num_mem_tcs_patch_outputs * perpatch_stride;
}
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
uint32_t lds_per_patch, uint32_t wave_size,
bool tess_uses_primid)
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
uint32_t wave_size, bool tess_uses_primid)
{
/* The VGT HS block increments the patch ID unconditionally within a single threadgroup.
* This results in incorrect patch IDs when instanced draws are used.
@ -956,8 +973,24 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
num_patches = MIN2(num_patches, 16); /* recommended */
/* Make sure the output data fits in the offchip buffer */
if (vram_per_patch)
num_patches = MIN2(num_patches, (info->hs_offchip_workgroup_dw_size * 4) / vram_per_patch);
unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches);
if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
/* Find the number of patches that fit in memory. Each output is aligned separately,
* so this division won't return a precise result.
*/
num_patches = info->hs_offchip_workgroup_dw_size * 4 /
get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, 1);
assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches) <=
info->hs_offchip_workgroup_dw_size * 4);
while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches + 1) <=
info->hs_offchip_workgroup_dw_size * 4)
num_patches++;
}
/* Make sure that the data fits in LDS. This assumes the shaders only
* use LDS for the inputs and outputs.

View file

@ -294,9 +294,9 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
unsigned max_vtx_out, unsigned prim_amp_factor);
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
uint32_t lds_per_patch, uint32_t wave_size,
bool tess_uses_primid);
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
uint32_t wave_size, bool tess_uses_primid);
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info);

View file

@ -76,28 +76,31 @@
* ### VRAM layout used by TCS-TES I/O:
*
* ```
* attr 0 of patch 0 vertex 0 < "off-chip LDS" offset
* attr 0 of patch 0 vertex 0 < "off-chip LDS" offset, aligned to >= 4K
* attr 0 of patch 0 vertex 1
* attr 0 of patch 0 vertex 2
* ...
* attr 0 of patch 1 vertex 0
* attr 0 of patch 1 vertex 1
* attr 0 of patch 1 vertex 2 < hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
* attr 0 of patch 1 vertex 2 < hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 2)
* ...
* attr 0 of patch 2 vertex 0
* attr 0 of patch 2 vertex 1
* attr 0 of patch 2 vertex 2
* ...
* attr 1 of patch 0 vertex 0
* [pad to 256B]
* attr 1 of patch 0 vertex 0 < aligned to 256B
* attr 1 of patch 0 vertex 1
* attr 1 of patch 0 vertex 2
* ...
* ...
* per-patch attr 0 of patch 0 < hs_out_patch_data_offset_amd
* [pad to 256B]
* per-patch attr 0 of patch 0 < hs_out_patch_data_offset_amd, aligned to 256B
* per-patch attr 0 of patch 1
* per-patch attr 0 of patch 2 < hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
* ...
* per-patch attr 1 of patch 0
* [pad to 256B]
* per-patch attr 1 of patch 0 < aligned to 256B
* per-patch attr 1 of patch 1
* per-patch attr 1 of patch 2
* ...
@ -477,6 +480,8 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign
nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
/* Align the stride to 256B. */
attr_stride = nir_align_imm(b, attr_stride, 256);
nir_def *off =
ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
hs_output_vram_map_io_location(b->shader, true, location, st));
@ -495,10 +500,12 @@ hs_per_patch_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsigne
{
nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_def *per_patch_data_offset = nir_load_hs_out_patch_data_offset_amd(b);
/* Align the stride to 256B. */
nir_def *attr_stride = nir_align_imm(b, nir_imul_imm(b, tcs_num_patches, 16u), 256);
nir_def *off =
io_offset
? ac_nir_calc_io_off(b, component, io_offset, nir_imul_imm(b, tcs_num_patches, 16u), 4u,
? ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
hs_output_vram_map_io_location(b->shader, false, location, st))
: nir_imm_int(b, 0);
@ -1023,8 +1030,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
const bool tes_reads_inner = st->tes_inputs_read & VARYING_BIT_TESS_LEVEL_INNER;
if (st->tcs_tess_level_outer_mask && tes_reads_outer) {
const unsigned tf_outer_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_OUTER, st);
nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_outer_loc * 16);
nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER, 0, zero, 0);
nir_store_buffer_amd(b, tessfactors.outer, hs_ring_tess_offchip,
vmem_off_outer, offchip_offset, zero,
@ -1033,8 +1039,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
}
if (tessfactors.inner && st->tcs_tess_level_inner_mask && tes_reads_inner) {
const unsigned tf_inner_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_INNER, st);
nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_inner_loc * 16);
nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_INNER, 0, zero, 0);
nir_store_buffer_amd(b, tessfactors.inner, hs_ring_tess_offchip,
vmem_off_inner, offchip_offset, zero,
@ -1328,8 +1333,8 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, uint64_t outputs_rea
unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
num_tcs_output_cp * lds_output_vertex_size +
lds_perpatch_output_patch_size;
unsigned mem_per_patch = (num_tcs_output_cp * num_mem_tcs_outputs + num_mem_tcs_patch_outputs) * 16;
unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp, mem_per_patch,
unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp,
num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
lds_per_patch, wave_size, tess_uses_primid);
unsigned lds_size = lds_per_patch * num_patches;

View file

@ -244,33 +244,31 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
break;
}
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
nir_def *num_tcs_outputs, *out_vertices_per_patch;
if (stage == MESA_SHADER_TESS_CTRL) {
num_tcs_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
} else {
if (s->info->inputs_linked) {
out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
num_tcs_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
num_tcs_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
}
}
nir_def *per_vertex_output_patch_size =
nir_imul(b, out_vertices_per_patch, nir_imul_imm(b, num_tcs_outputs, 16u));
nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs;
if (s->info->num_tess_patches) {
unsigned num_patches = s->info->num_tess_patches;
replacement = nir_imul_imm(b, per_vertex_output_patch_size, num_patches);
num_patches = nir_imm_int(b, s->info->num_tess_patches);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES);
nir_def *num_patches = nir_iadd_imm_nuw(b, n, 1);
replacement = nir_imul(b, per_vertex_output_patch_size, num_patches);
num_patches = nir_iadd_imm_nuw(b, n, 1);
}
if (stage == MESA_SHADER_TESS_CTRL) {
out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
} else if (s->info->inputs_linked) {
out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
}
/* Compute the stride of a single output. */
nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16));
attr_stride = nir_align_imm(b, attr_stride, 256);
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
break;
}
case nir_intrinsic_load_sample_positions_amd: {

View file

@ -1221,6 +1221,16 @@ nir_umod_imm(nir_builder *build, nir_def *x, uint64_t y)
}
}
/* Build NIR that rounds the integer SSA value "x" up to the next multiple of
 * "align". "align" must be a nonzero power of two; emits the classic
 * (x + align - 1) & ~(align - 1) sequence, or nothing when align == 1.
 */
static inline nir_def *
nir_align_imm(nir_builder *b, nir_def *x, uint64_t align)
{
   assert(util_is_power_of_two_nonzero64(align));

   /* Aligning to 1 is a no-op; don't emit any instructions. */
   if (align == 1)
      return x;

   const uint64_t mask = ~(align - 1);
   nir_def *biased = nir_iadd_imm(b, x, align - 1);
   return nir_iand_imm(b, biased, mask);
}
static inline nir_def *
nir_ibfe_imm(nir_builder *build, nir_def *x, uint32_t offset, uint32_t size)
{

View file

@ -309,24 +309,23 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
break;
}
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
nir_def *per_vtx_out_patch_size = NULL;
nir_def *tcs_num_patches =
nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7), 1);
nir_def *tcs_out_vertices, *num_tcs_mem_outputs;
if (stage == MESA_SHADER_TESS_CTRL) {
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
const unsigned out_vtx_size = num_hs_out * 16;
const unsigned out_vtx_per_patch = b->shader->info.tess.tcs_vertices_out;
per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes));
} else {
nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
tcs_out_vertices =
nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1);
num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
}
nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
/* Compute the stride of a single output. */
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16));
attr_stride = nir_align_imm(b, attr_stride, 256);
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
break;
}
case nir_intrinsic_load_clip_half_line_width_amd: {