diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index f9d5e2153a3..af3688212b3 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -926,10 +926,27 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
    return CLAMP(workgroup_size, 1, 256);
 }
 
+static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                           uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
+{
+   /* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's optimal when
+    * the 16 vec4 elements are written by 16 consecutive lanes.
+    *
+    * 256B is the granularity of interleaving memory channels, which means a single output store
+    * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
+    * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
+    */
+   unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, 256);
+   unsigned mem_one_perpatch_output = align(16 * num_patches, 256);
+
+   return mem_one_pervertex_output * num_mem_tcs_outputs +
+          mem_one_perpatch_output * num_mem_tcs_patch_outputs;
+}
+
 uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
-                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
-                                     uint32_t lds_per_patch, uint32_t wave_size,
-                                     bool tess_uses_primid)
+                                     uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                     uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
+                                     uint32_t wave_size, bool tess_uses_primid)
 {
    /* The VGT HS block increments the patch ID unconditionally within a single threadgroup.
     * This results in incorrect patch IDs when instanced draws are used.
@@ -956,8 +973,24 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
    num_patches = MIN2(num_patches, 16); /* recommended */
 
    /* Make sure the output data fits in the offchip buffer */
-   if (vram_per_patch)
-      num_patches = MIN2(num_patches, (info->hs_offchip_workgroup_dw_size * 4) / vram_per_patch);
+   unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                                  num_mem_tcs_patch_outputs, num_patches);
+   if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
+      /* Find the number of patches that fit in memory. Each output is aligned separately,
+       * so this division won't return a precise result.
+       */
+      num_patches = info->hs_offchip_workgroup_dw_size * 4 /
+                    get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                               num_mem_tcs_patch_outputs, 1);
+      assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                        num_mem_tcs_patch_outputs, num_patches) <=
+             info->hs_offchip_workgroup_dw_size * 4);
+
+      while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                                        num_mem_tcs_patch_outputs, num_patches + 1) <=
+             info->hs_offchip_workgroup_dw_size * 4)
+         num_patches++;
+   }
 
    /* Make sure that the data fits in LDS. This assumes the shaders only
     * use LDS for the inputs and outputs.
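For reference, the fitting logic above can be exercised in isolation. This is a minimal standalone sketch, not part of the patch: `align_pot` stands in for Mesa's `align()`, and the shader counts and `budget` are made-up example values. Because each output array is padded to 256B separately, the total size is sublinear in `num_patches`, which is why the code divides by the 1-patch size for a conservative lower bound and then grows greedily.

```c
#include <assert.h>
#include <stdio.h>

/* Stand-in for Mesa's align(): round x up to a power-of-two multiple. */
static unsigned align_pot(unsigned x, unsigned a)
{
   return (x + a - 1) & ~(a - 1);
}

/* Mirrors get_tcs_wg_output_mem_size(): each output array padded to 256B. */
static unsigned wg_output_mem_size(unsigned out_cp, unsigned num_outputs,
                                   unsigned num_patch_outputs, unsigned num_patches)
{
   unsigned one_pervertex = align_pot(16 * out_cp * num_patches, 256);
   unsigned one_perpatch = align_pot(16 * num_patches, 256);
   return one_pervertex * num_outputs + one_perpatch * num_patch_outputs;
}

int main(void)
{
   const unsigned budget = 8192 * 4; /* hypothetical hs_offchip_workgroup_dw_size * 4 */
   unsigned num_patches = 40;        /* hypothetical result of the earlier clamps */

   if (wg_output_mem_size(4, 16, 2, num_patches) > budget) {
      /* Lower bound: the 1-patch size over-estimates the per-patch cost. */
      num_patches = budget / wg_output_mem_size(4, 16, 2, 1);
      assert(wg_output_mem_size(4, 16, 2, num_patches) <= budget);
      /* Grow greedily until the next patch no longer fits. */
      while (wg_output_mem_size(4, 16, 2, num_patches + 1) <= budget)
         num_patches++;
   }
   printf("num_patches = %u\n", num_patches); /* prints 30 for these values */
   return 0;
}
```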
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index d1936ce2540..3ede1af6bb4 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -294,9 +294,9 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
                                        unsigned max_vtx_out, unsigned prim_amp_factor);
 
 uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
-                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
-                                     uint32_t lds_per_patch, uint32_t wave_size,
-                                     bool tess_uses_primid);
+                                     uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+                                     uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
+                                     uint32_t wave_size, bool tess_uses_primid);
 
 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                         const struct radeon_info *info);
diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
index d220fb35687..e699bc89aba 100644
--- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
@@ -76,28 +76,31 @@
  * ### VRAM layout used by TCS-TES I/O:
  *
  * ```
- * attr 0 of patch 0 vertex 0   <─── "off-chip LDS" offset
+ * attr 0 of patch 0 vertex 0   <─── "off-chip LDS" offset, aligned to >= 4K
  * attr 0 of patch 0 vertex 1
  * attr 0 of patch 0 vertex 2
  * ...
  * attr 0 of patch 1 vertex 0
  * attr 0 of patch 1 vertex 1
- * attr 0 of patch 1 vertex 2   <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
+ * attr 0 of patch 1 vertex 2   <─── hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 2)
  * ...
  * attr 0 of patch 2 vertex 0
  * attr 0 of patch 2 vertex 1
 * attr 0 of patch 2 vertex 2
  * ...
- * attr 1 of patch 0 vertex 0
+ * [pad to 256B]
+ * attr 1 of patch 0 vertex 0   <─── aligned to 256B
  * attr 1 of patch 0 vertex 1
  * attr 1 of patch 0 vertex 2
  * ...
  * ...
- * per-patch attr 0 of patch 0  <─── hs_out_patch_data_offset_amd
+ * [pad to 256B]
+ * per-patch attr 0 of patch 0  <─── hs_out_patch_data_offset_amd, aligned to 256B
  * per-patch attr 0 of patch 1
  * per-patch attr 0 of patch 2  <─── hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
  * ...
- * per-patch attr 1 of patch 0
+ * [pad to 256B]
+ * per-patch attr 1 of patch 0  <─── aligned to 256B
  * per-patch attr 1 of patch 1
  * per-patch attr 1 of patch 2
  * ...
@@ -477,6 +480,8 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign
    nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
    nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
+   /* Align the stride to 256B. */
+   attr_stride = nir_align_imm(b, attr_stride, 256);
 
    nir_def *off =
       ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
                          hs_output_vram_map_io_location(b->shader, true, location, st));
@@ -495,10 +500,12 @@ hs_per_patch_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsigne
 {
    nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
    nir_def *per_patch_data_offset = nir_load_hs_out_patch_data_offset_amd(b);
+   /* Align the stride to 256B. */
+   nir_def *attr_stride = nir_align_imm(b, nir_imul_imm(b, tcs_num_patches, 16u), 256);
 
    nir_def *off =
       io_offset
-         ? ac_nir_calc_io_off(b, component, io_offset, nir_imul_imm(b, tcs_num_patches, 16u), 4u,
+         ? ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
                               hs_output_vram_map_io_location(b->shader, false, location, st))
          : nir_imm_int(b, 0);
 
@@ -1023,8 +1030,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
    const bool tes_reads_inner = st->tes_inputs_read & VARYING_BIT_TESS_LEVEL_INNER;
 
    if (st->tcs_tess_level_outer_mask && tes_reads_outer) {
-      const unsigned tf_outer_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_OUTER, st);
-      nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_outer_loc * 16);
+      nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER, 0, zero, 0);
 
       nir_store_buffer_amd(b, tessfactors.outer, hs_ring_tess_offchip,
                            vmem_off_outer, offchip_offset, zero,
@@ -1033,8 +1039,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
    }
 
    if (tessfactors.inner && st->tcs_tess_level_inner_mask && tes_reads_inner) {
-      const unsigned tf_inner_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_INNER, st);
-      nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_inner_loc * 16);
+      nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_INNER, 0, zero, 0);
 
       nir_store_buffer_amd(b, tessfactors.inner, hs_ring_tess_offchip,
                            vmem_off_inner, offchip_offset, zero,
@@ -1328,8 +1333,8 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, uint64_t outputs_rea
 
    unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
                             num_tcs_output_cp * lds_output_vertex_size + lds_perpatch_output_patch_size;
-   unsigned mem_per_patch = (num_tcs_output_cp * num_mem_tcs_outputs + num_mem_tcs_patch_outputs) * 16;
-   unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp, mem_per_patch,
+   unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp,
+                                                      num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
                                                       lds_per_patch, wave_size, tess_uses_primid);
    unsigned lds_size = lds_per_patch * num_patches;
 
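To make the new layout concrete, here is a plain-C sketch of the address math implied by the diagram and the two offset helpers above. All counts are invented example values, and the helper names are illustrative, not Mesa API; the point is that every per-vertex and per-patch attribute array now starts on a 256B boundary.

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t align_pot(uint32_t x, uint32_t a)
{
   return (x + a - 1) & ~(a - 1);
}

int main(void)
{
   const uint32_t num_patches = 9, out_vertices_per_patch = 3;
   const uint32_t num_mem_outputs = 5; /* per-vertex outputs stored in VRAM */

   /* Per-vertex attribute stride: one vec4 (16B) per vertex of every patch,
    * padded to 256B so each attribute starts on a channel boundary. */
   uint32_t attr_stride = align_pot(num_patches * out_vertices_per_patch * 16, 256);

   /* hs_per_vertex_output_vmem_offset for (slot, rel_patch_id, vertex): */
   uint32_t slot = 0, rel_patch_id = 1, vertex = 2;
   uint32_t off = slot * attr_stride + (rel_patch_id * out_vertices_per_patch + vertex) * 16;
   printf("per-vertex offset = %u\n", off); /* 80 */

   /* hs_out_patch_data_offset_amd: per-patch data starts after all
    * 256B-padded per-vertex attributes. */
   uint32_t patch_data_offset = attr_stride * num_mem_outputs;

   /* hs_per_patch_output_vmem_offset: the per-patch attribute stride is
    * 16B * num_patches, likewise padded to 256B. */
   uint32_t patch_attr_stride = align_pot(num_patches * 16, 256);
   uint32_t per_patch_off = patch_data_offset + slot * patch_attr_stride + 2 * 16;
   printf("per-patch offset = %u\n", per_patch_off); /* 2592 */
   return 0;
}
```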
diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c
index 85cb42c9371..498cdc69d36 100644
--- a/src/amd/vulkan/nir/radv_nir_lower_abi.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c
@@ -244,33 +244,31 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
       break;
    }
    case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
-      nir_def *num_tcs_outputs, *out_vertices_per_patch;
-
-      if (stage == MESA_SHADER_TESS_CTRL) {
-         num_tcs_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
-         out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
-      } else {
-         if (s->info->inputs_linked) {
-            out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
-            num_tcs_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
-         } else {
-            nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
-            out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
-            num_tcs_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
-         }
-      }
-
-      nir_def *per_vertex_output_patch_size =
-         nir_imul(b, out_vertices_per_patch, nir_imul_imm(b, num_tcs_outputs, 16u));
+      nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs;
 
       if (s->info->num_tess_patches) {
-         unsigned num_patches = s->info->num_tess_patches;
-         replacement = nir_imul_imm(b, per_vertex_output_patch_size, num_patches);
+         num_patches = nir_imm_int(b, s->info->num_tess_patches);
       } else {
          nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES);
-         nir_def *num_patches = nir_iadd_imm_nuw(b, n, 1);
-         replacement = nir_imul(b, per_vertex_output_patch_size, num_patches);
+         num_patches = nir_iadd_imm_nuw(b, n, 1);
       }
+
+      if (stage == MESA_SHADER_TESS_CTRL) {
+         out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
+      } else if (s->info->inputs_linked) {
+         out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
+      } else {
+         nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
+         out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
+         num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
+      }
+
+      /* Compute the stride of a single output. */
+      nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16));
+      attr_stride = nir_align_imm(b, attr_stride, 256);
+      replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
       break;
    }
    case nir_intrinsic_load_sample_positions_amd: {
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 9e8e5bd7858..83b817449b2 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -1221,6 +1221,16 @@ nir_umod_imm(nir_builder *build, nir_def *x, uint64_t y)
    }
 }
 
+static inline nir_def *
+nir_align_imm(nir_builder *b, nir_def *x, uint64_t align)
+{
+   if (align == 1)
+      return x;
+
+   assert(util_is_power_of_two_nonzero64(align));
+   return nir_iand_imm(b, nir_iadd_imm(b, x, align - 1), ~(align - 1));
+}
+
 static inline nir_def *
 nir_ibfe_imm(nir_builder *build, nir_def *x, uint32_t offset, uint32_t size)
 {
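The new `nir_align_imm()` helper emits the usual power-of-two round-up: add `align - 1`, then clear the low bits with a mask (hence the assert that `align` is a nonzero power of two). A scalar C equivalent, for reference only:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of the instruction sequence nir_align_imm() builds. */
static uint64_t align_up(uint64_t x, uint64_t align)
{
   assert(align != 0 && (align & (align - 1)) == 0); /* power of two */
   return (x + align - 1) & ~(align - 1);
}

int main(void)
{
   printf("%llu\n", (unsigned long long)align_up(432, 256)); /* 512 */
   printf("%llu\n", (unsigned long long)align_up(512, 256)); /* 512: already aligned */
   return 0;
}
```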
diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index 3bf567aa9cb..6b3b6a336e7 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -309,24 +309,23 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
       break;
    }
    case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
-      nir_def *per_vtx_out_patch_size = NULL;
+      nir_def *tcs_num_patches =
+         nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7), 1);
+      nir_def *tcs_out_vertices, *num_tcs_mem_outputs;
 
       if (stage == MESA_SHADER_TESS_CTRL) {
-         const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
-         const unsigned out_vtx_size = num_hs_out * 16;
-         const unsigned out_vtx_per_patch = b->shader->info.tess.tcs_vertices_out;
-         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
+         tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
+         num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes));
       } else {
-         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
-         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
-         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
-         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
-         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
+         tcs_out_vertices =
+            nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1);
+         num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
       }
 
-      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
-      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
-      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
+      /* Compute the stride of a single output. */
+      nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16));
+      attr_stride = nir_align_imm(b, attr_stride, 256);
+      replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
       break;
    }
    case nir_intrinsic_load_clip_half_line_width_amd: {
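With this change both ABI lowerings compute the same value, `align(num_patches * tcs_vertices_out * 16, 256) * num_tcs_mem_outputs`, which matches the per-vertex term of `get_tcs_wg_output_mem_size()`. The sketch below replays that arithmetic around the radeonsi SGPR fields; `pack_tcs_offchip_layout` is a hypothetical helper for illustration only, with bit positions taken from the unpack calls above.

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative packing of the fields this hunk unpacks from tcs_offchip_layout:
 * bits 0..6:  num_patches - 1
 * bits 7..11: tcs_vertices_out - 1
 * bits 23..28: number of TCS outputs stored in memory */
static uint32_t pack_tcs_offchip_layout(uint32_t num_patches, uint32_t out_vertices,
                                        uint32_t num_mem_outputs)
{
   return (num_patches - 1) | ((out_vertices - 1) << 7) | (num_mem_outputs << 23);
}

int main(void)
{
   uint32_t layout = pack_tcs_offchip_layout(30, 3, 5);
   uint32_t num_patches = (layout & 0x7f) + 1;
   uint32_t out_vertices = ((layout >> 7) & 0x1f) + 1;
   uint32_t num_mem_outputs = (layout >> 23) & 0x3f;

   /* Same arithmetic as the lowered code: per-output stride, 256B-aligned. */
   uint32_t attr_stride = ((num_patches * out_vertices * 16) + 255) & ~255u;
   printf("patch_data_offset = %u\n", attr_stride * num_mem_outputs); /* 7680 */
   return 0;
}
```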