ac/nir/tess: adjust memory layout of TCS outputs to have aligned store offsets
There is a comment that explains it.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34780>
parent 80236f2367
commit 534b282573
6 changed files with 100 additions and 55 deletions

@@ -926,10 +926,27 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
    return CLAMP(workgroup_size, 1, 256);
 }
 
+static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                           uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
+{
+   /* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
+    * the 16 vec4 elements are written by 16 consecutive lanes.
+    *
+    * 256B is the granularity of interleaving memory channels, which means a single output store
+    * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
+    * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
+    */
+   unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, 256);
+   unsigned mem_one_perpatch_output = align(16 * num_patches, 256);
+
+   return mem_one_pervertex_output * num_mem_tcs_outputs +
+          mem_one_perpatch_output * num_mem_tcs_patch_outputs;
+}
+
 uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
-                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
-                                     uint32_t lds_per_patch, uint32_t wave_size,
-                                     bool tess_uses_primid)
+                                     uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                     uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
+                                     uint32_t wave_size, bool tess_uses_primid)
 {
    /* The VGT HS block increments the patch ID unconditionally within a single threadgroup.
     * This results in incorrect patch IDs when instanced draws are used.
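
To make the padding concrete, here is a standalone sketch (not part of the patch) that mirrors the helper's arithmetic; `align_up` stands in for Mesa's `align()` macro and the shader counts are made up:

```c
#include <stdio.h>

/* Round x up to a power-of-two alignment; stand-in for Mesa's align(). */
static unsigned align_up(unsigned x, unsigned a)
{
   return (x + a - 1) & ~(a - 1);
}

/* Mirrors get_tcs_wg_output_mem_size(): each per-vertex and per-patch
 * output slot is padded to a multiple of 256B; 16B = one vec4. */
static unsigned wg_output_mem_size(unsigned out_cp, unsigned num_outputs,
                                   unsigned num_patch_outputs, unsigned num_patches)
{
   unsigned one_pervertex = align_up(16 * out_cp * num_patches, 256);
   unsigned one_perpatch = align_up(16 * num_patches, 256);
   return one_pervertex * num_outputs + one_perpatch * num_patch_outputs;
}

int main(void)
{
   /* 4 vertices/patch, 8 per-vertex outputs, 2 per-patch outputs, 10 patches:
    * per-vertex slot: align(16 * 4 * 10, 256) = 768B (640B unpadded)
    * per-patch slot:  align(16 * 10, 256)     = 256B (160B unpadded) */
   printf("%u\n", wg_output_mem_size(4, 8, 2, 10)); /* 768*8 + 256*2 = 6656 */
   return 0;
}
```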

@@ -956,8 +973,24 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
    num_patches = MIN2(num_patches, 16); /* recommended */
 
    /* Make sure the output data fits in the offchip buffer */
-   if (vram_per_patch)
-      num_patches = MIN2(num_patches, (info->hs_offchip_workgroup_dw_size * 4) / vram_per_patch);
+   unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                                  num_mem_tcs_patch_outputs, num_patches);
+   if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
+      /* Find the number of patches that fit in memory. Each output is aligned separately,
+       * so this division won't return a precise result.
+       */
+      num_patches = info->hs_offchip_workgroup_dw_size * 4 /
+                    get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                               num_mem_tcs_patch_outputs, 1);
+      assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                        num_mem_tcs_patch_outputs, num_patches) <=
+             info->hs_offchip_workgroup_dw_size * 4);
+
+      while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                        num_mem_tcs_patch_outputs, num_patches + 1) <=
+             info->hs_offchip_workgroup_dw_size * 4)
+         num_patches++;
+   }
 
    /* Make sure that the data fits in LDS. This assumes the shaders only
     * use LDS for the inputs and outputs.
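
Why the loop: a single patch pays the full 256B padding on every output, so dividing the budget by the one-patch size can only undercount; the padding amortizes as more patches share each padded slot. A toy version of the fix-up, reusing `wg_output_mem_size()` from the sketch above (the budget value is an arbitrary stand-in for `hs_offchip_workgroup_dw_size * 4`):

```c
static unsigned fit_patches(unsigned budget, unsigned out_cp,
                            unsigned num_outputs, unsigned num_patch_outputs)
{
   /* Initial guess: whole one-patch-sized (fully padded) chunks.
    * This never overshoots, matching the assert in the patch. */
   unsigned n = budget / wg_output_mem_size(out_cp, num_outputs,
                                            num_patch_outputs, 1);
   /* Padding amortizes across patches, so keep growing while it fits. */
   while (wg_output_mem_size(out_cp, num_outputs, num_patch_outputs, n + 1) <= budget)
      n++;
   return n;
}
/* fit_patches(6656, 4, 8, 2): the division guesses 6656 / 2560 = 2 patches,
 * but the loop grows the count to 12 because patches 3..12 fit inside
 * padding that patch 1 already paid for. */
```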

@@ -294,9 +294,9 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
                                        unsigned max_vtx_out, unsigned prim_amp_factor);
 
 uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
-                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
-                                     uint32_t lds_per_patch, uint32_t wave_size,
-                                     bool tess_uses_primid);
+                                     uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                     uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
+                                     uint32_t wave_size, bool tess_uses_primid);
 
 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                         const struct radeon_info *info);

@@ -76,28 +76,31 @@
  * ### VRAM layout used by TCS-TES I/O:
  *
  * ```
- * attr 0 of patch 0 vertex 0   <─── "off-chip LDS" offset
+ * attr 0 of patch 0 vertex 0   <─── "off-chip LDS" offset, aligned to >= 4K
  * attr 0 of patch 0 vertex 1
  * attr 0 of patch 0 vertex 2
  * ...
  * attr 0 of patch 1 vertex 0
  * attr 0 of patch 1 vertex 1
- * attr 0 of patch 1 vertex 2   <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
+ * attr 0 of patch 1 vertex 2   <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 2)
  * ...
  * attr 0 of patch 2 vertex 0
  * attr 0 of patch 2 vertex 1
  * attr 0 of patch 2 vertex 2
  * ...
- * attr 1 of patch 0 vertex 0
+ * [pad to 256B]
+ * attr 1 of patch 0 vertex 0   <─── aligned to 256B
  * attr 1 of patch 0 vertex 1
  * attr 1 of patch 0 vertex 2
  * ...
  * ...
- * per-patch attr 0 of patch 0  <─── hs_out_patch_data_offset_amd
+ * [pad to 256B]
+ * per-patch attr 0 of patch 0  <─── hs_out_patch_data_offset_amd, aligned to 256B
 * per-patch attr 0 of patch 1
 * per-patch attr 0 of patch 2  <─── hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
 * ...
- * per-patch attr 1 of patch 0
+ * [pad to 256B]
+ * per-patch attr 1 of patch 0  <─── aligned to 256B
 * per-patch attr 1 of patch 1
 * per-patch attr 1 of patch 2
 * ...
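
The two byte-offset formulas implied by this diagram, written out as scalar C for clarity. The helper names here are hypothetical and `align_up` is the stand-in from the earlier sketch; 16B = one vec4 attribute:

```c
/* Offset of "attr A of patch P vertex V" within the per-vertex region. */
static unsigned per_vertex_output_offset(unsigned attr, unsigned rel_patch_id,
                                         unsigned vertex, unsigned out_cp,
                                         unsigned num_patches)
{
   /* One attribute spans all patches and vertices, padded to 256B. */
   unsigned attr_stride = align_up(16 * out_cp * num_patches, 256);
   return attr * attr_stride + (rel_patch_id * out_cp + vertex) * 16;
}

/* Offset of "per-patch attr A of patch P", relative to
 * hs_out_patch_data_offset_amd. */
static unsigned per_patch_output_offset(unsigned attr, unsigned rel_patch_id,
                                        unsigned num_patches)
{
   unsigned attr_stride = align_up(16 * num_patches, 256);
   return attr * attr_stride + rel_patch_id * 16;
}
```

These are the same strides the two hunks below build in NIR for `hs_per_vertex_output_vmem_offset` and `hs_per_patch_output_vmem_offset`.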

@@ -477,6 +480,8 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign
 
    nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
    nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
+   /* Align the stride to 256B. */
+   attr_stride = nir_align_imm(b, attr_stride, 256);
    nir_def *off =
       ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
                          hs_output_vram_map_io_location(b->shader, true, location, st));

@@ -495,10 +500,12 @@ hs_per_patch_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsigne
 {
    nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
    nir_def *per_patch_data_offset = nir_load_hs_out_patch_data_offset_amd(b);
+   /* Align the stride to 256B. */
+   nir_def *attr_stride = nir_align_imm(b, nir_imul_imm(b, tcs_num_patches, 16u), 256);
 
    nir_def *off =
       io_offset
-         ? ac_nir_calc_io_off(b, component, io_offset, nir_imul_imm(b, tcs_num_patches, 16u), 4u,
+         ? ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
                               hs_output_vram_map_io_location(b->shader, false, location, st))
          : nir_imm_int(b, 0);
 

@@ -1023,8 +1030,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
    const bool tes_reads_inner = st->tes_inputs_read & VARYING_BIT_TESS_LEVEL_INNER;
 
    if (st->tcs_tess_level_outer_mask && tes_reads_outer) {
-      const unsigned tf_outer_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_OUTER, st);
-      nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_outer_loc * 16);
+      nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER, 0, zero, 0);
 
       nir_store_buffer_amd(b, tessfactors.outer, hs_ring_tess_offchip,
                            vmem_off_outer, offchip_offset, zero,

@@ -1033,8 +1039,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
    }
 
    if (tessfactors.inner && st->tcs_tess_level_inner_mask && tes_reads_inner) {
-      const unsigned tf_inner_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_INNER, st);
-      nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_inner_loc * 16);
+      nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_INNER, 0, zero, 0);
 
       nir_store_buffer_amd(b, tessfactors.inner, hs_ring_tess_offchip,
                            vmem_off_inner, offchip_offset, zero,

@@ -1328,8 +1333,8 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, uint64_t outputs_rea
    unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
                             num_tcs_output_cp * lds_output_vertex_size +
                             lds_perpatch_output_patch_size;
-   unsigned mem_per_patch = (num_tcs_output_cp * num_mem_tcs_outputs + num_mem_tcs_patch_outputs) * 16;
-   unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp, mem_per_patch,
+   unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp,
+                                                      num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
                                                       lds_per_patch, wave_size, tess_uses_primid);
    unsigned lds_size = lds_per_patch * num_patches;
 

@@ -244,33 +244,31 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
       break;
    }
    case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
-      nir_def *num_tcs_outputs, *out_vertices_per_patch;
-
-      if (stage == MESA_SHADER_TESS_CTRL) {
-         num_tcs_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
-         out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
-      } else {
-         if (s->info->inputs_linked) {
-            out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
-            num_tcs_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
-         } else {
-            nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
-            out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
-            num_tcs_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
-         }
-      }
-
-      nir_def *per_vertex_output_patch_size =
-         nir_imul(b, out_vertices_per_patch, nir_imul_imm(b, num_tcs_outputs, 16u));
+      nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs;
 
       if (s->info->num_tess_patches) {
-         unsigned num_patches = s->info->num_tess_patches;
-         replacement = nir_imul_imm(b, per_vertex_output_patch_size, num_patches);
+         num_patches = nir_imm_int(b, s->info->num_tess_patches);
       } else {
          nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES);
-         nir_def *num_patches = nir_iadd_imm_nuw(b, n, 1);
-         replacement = nir_imul(b, per_vertex_output_patch_size, num_patches);
+         num_patches = nir_iadd_imm_nuw(b, n, 1);
       }
+
+      if (stage == MESA_SHADER_TESS_CTRL) {
+         out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
+      } else if (s->info->inputs_linked) {
+         out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
+      } else {
+         nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
+         out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
+         num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
+      }
+
+      /* Compute the stride of a single output. */
+      nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16));
+      attr_stride = nir_align_imm(b, attr_stride, 256);
+      replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
       break;
    }
    case nir_intrinsic_load_sample_positions_amd: {
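
In other words, the lowered intrinsic now evaluates to the total size of the per-vertex output region: the 256B-aligned per-attribute stride times the number of outputs kept in memory. As scalar arithmetic, with the same made-up counts as the first sketch and `align_up` from there:

```c
unsigned num_patches = 10, out_vertices_per_patch = 4, num_tcs_mem_outputs = 8;
unsigned attr_stride = align_up(16 * out_vertices_per_patch * num_patches, 256); /* 768 */
unsigned hs_out_patch_data_offset = attr_stride * num_tcs_mem_outputs;           /* 6144 */
```

The per-patch outputs then start right after that region, matching the `hs_out_patch_data_offset_amd` arrow in the VRAM layout diagram.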

@@ -1221,6 +1221,16 @@ nir_umod_imm(nir_builder *build, nir_def *x, uint64_t y)
    }
 }
 
+static inline nir_def *
+nir_align_imm(nir_builder *b, nir_def *x, uint64_t align)
+{
+   if (align == 1)
+      return x;
+
+   assert(util_is_power_of_two_nonzero64(align));
+   return nir_iand_imm(b, nir_iadd_imm(b, x, align - 1), ~(align - 1));
+}
+
 static inline nir_def *
 nir_ibfe_imm(nir_builder *build, nir_def *x, uint32_t offset, uint32_t size)
 {
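
The new builder helper emits the standard power-of-two round-up, `(x + align - 1) & ~(align - 1)`. A CPU-side illustration of the same idiom (not Mesa code):

```c
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t align_up_u64(uint64_t x, uint64_t align)
{
   /* ~(align - 1) only clears the low bits when align is a power of two,
    * which is why nir_align_imm() asserts it. */
   assert(align != 0 && (align & (align - 1)) == 0);
   return (x + align - 1) & ~(align - 1);
}

int main(void)
{
   printf("%" PRIu64 "\n", align_up_u64(640, 256)); /* 768 */
   printf("%" PRIu64 "\n", align_up_u64(768, 256)); /* 768: already aligned */
   return 0;
}
```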

@@ -309,24 +309,23 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
       break;
    }
    case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
-      nir_def *per_vtx_out_patch_size = NULL;
+      nir_def *tcs_num_patches =
+         nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7), 1);
+      nir_def *tcs_out_vertices, *num_tcs_mem_outputs;
 
       if (stage == MESA_SHADER_TESS_CTRL) {
-         const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
-         const unsigned out_vtx_size = num_hs_out * 16;
-         const unsigned out_vtx_per_patch = b->shader->info.tess.tcs_vertices_out;
-         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
+         tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes));
       } else {
-         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
-         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
-         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
-         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
-         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
+         tcs_out_vertices =
+            nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1);
+         num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
       }
 
-      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
-      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
-      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
+      /* Compute the stride of a single output. */
+      nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16));
+      attr_stride = nir_align_imm(b, attr_stride, 256);
+      replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
       break;
    }
    case nir_intrinsic_load_clip_half_line_width_amd: {
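
For reference, the `tcs_offchip_layout` bit ranges used above, modeled as scalar unpacking. The field meanings are inferred from the code, including the minus-one biasing that the `nir_iadd_imm_nuw` calls undo; this is an illustration, not the driver's actual definition:

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t unpack_bits(uint32_t v, unsigned offset, unsigned bits)
{
   return (v >> offset) & ((1u << bits) - 1);
}

int main(void)
{
   uint32_t layout = (9u << 0) | (3u << 7) | (8u << 23); /* sample packed value */

   /* bits 0..6: num_patches - 1; bits 7..11: tcs_out_vertices - 1;
    * bits 23..28: number of TCS outputs stored in memory. */
   printf("num_patches = %u\n", unpack_bits(layout, 0, 7) + 1);      /* 10 */
   printf("tcs_out_vertices = %u\n", unpack_bits(layout, 7, 5) + 1); /* 4 */
   printf("num_tcs_mem_outputs = %u\n", unpack_bits(layout, 23, 6)); /* 8 */
   return 0;
}
```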