diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c index e699bc89aba..962c85e5f0e 100644 --- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c @@ -477,11 +477,7 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign nir_def *out_vertices_per_patch = b->shader->info.stage == MESA_SHADER_TESS_CTRL ? nir_imm_int(b, b->shader->info.tess.tcs_vertices_out) : nir_load_patch_vertices_in(b); - - nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b); - nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u)); - /* Align the stride to 256B. */ - attr_stride = nir_align_imm(b, attr_stride, 256); + nir_def *attr_stride = nir_load_tcs_mem_attrib_stride(b); nir_def *off = ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u, hs_output_vram_map_io_location(b->shader, true, location, st)); diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index 3d1e80b50fd..f8880face78 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -242,34 +242,35 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) } break; } - case nir_intrinsic_load_hs_out_patch_data_offset_amd: { - nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs; - + case nir_intrinsic_load_tcs_mem_attrib_stride: + case nir_intrinsic_load_hs_out_patch_data_offset_amd: if (s->info->num_tess_patches) { - num_patches = nir_imm_int(b, s->info->num_tess_patches); + /* The stride is a compile-time constant. */ + unsigned tcs_vertices_out = + stage == MESA_SHADER_TESS_CTRL ? b->shader->info.tess.tcs_vertices_out : s->info->tes.tcs_vertices_out; + assert(tcs_vertices_out); + /* Align the stride to 256B. 
*/ + replacement = nir_imm_int(b, align(s->info->num_tess_patches * tcs_vertices_out * 16, 256)); } else { - num_patches = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES); + replacement = nir_imul_imm( + b, GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE), 256); } - if (stage == MESA_SHADER_TESS_CTRL) { - out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out); - num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs); - } else if (s->info->inputs_linked) { - out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out); - num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs); - } else { - assert(stage == MESA_SHADER_TESS_EVAL); - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); - out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1); - num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS); - } + if (intrin->intrinsic == nir_intrinsic_load_hs_out_patch_data_offset_amd) { + nir_def *num_tcs_mem_outputs; - /* Compute the stride of a single output. 
*/ - nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16)); - attr_stride = nir_align_imm(b, attr_stride, 256); - replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs); + if (stage == MESA_SHADER_TESS_CTRL) { + num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs); + } else if (s->info->inputs_linked) { + num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs); + } else { + assert(stage == MESA_SHADER_TESS_EVAL); + num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS); + } + + replacement = nir_imul(b, replacement, num_tcs_mem_outputs); + } break; - } case nir_intrinsic_load_sample_positions_amd: { uint32_t sample_pos_offset = (RING_PS_SAMPLE_POSITIONS * 16) - 8; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index cc2642db5b2..dd4479758ed 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -10654,7 +10654,11 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer) const uint32_t tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tcs, AC_UD_TCS_OFFCHIP_LAYOUT); const uint32_t tes_offchip_layout_offset = radv_get_user_sgpr_loc(tes, AC_UD_TCS_OFFCHIP_LAYOUT); if (tcs_offchip_layout_offset) { + unsigned tcs_out_mem_attrib_stride = + align(cmd_buffer->state.tess_num_patches * tcs->info.tcs.tcs_vertices_out * 16, 256) / 256; + uint32_t tmp = SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches) | + SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE, tcs_out_mem_attrib_stride) | SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) | SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) | SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) | diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index e177fa54c4b..135577f24e7 100644 --- 
a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -202,8 +202,10 @@ struct radv_nir_compiler_options { #define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__SHIFT 0 #define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__MASK 0x7f -#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__SHIFT 7 -#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__MASK 0x1f +#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__SHIFT 7 +#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__MASK 0x1f +#define TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE__SHIFT 12 +#define TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE__MASK 0x1f #define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__SHIFT 17 #define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__MASK 0x3f #define TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS__SHIFT 23 diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index 8060c26f5e8..e57603cc950 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -110,7 +110,7 @@ struct radv_shader_args { /* # [0:6] = the number of tessellation patches, max = 127 * # [7:11] = TCS: the number of input patch control points minus one, max = 31 * TES: the number of output patch control points minus one, max = 31 - * # [12:16] = (unused) + * # [12:16] = the stride of 1 TCS per-vertex output in memory / 256, max = 16 * # [17:22] = the number of LS outputs, up to 32 * # [23:28] = the number of HS per-vertex outputs, up to 32 * # [29:30] = tess_primitive_mode diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 409e82ac404..df2d031d0f0 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -348,6 +348,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_fbfetch_image_fmask_desc_amd: case nir_intrinsic_load_fbfetch_image_desc_amd: case nir_intrinsic_load_polygon_stipple_buffer_amd: + case nir_intrinsic_load_tcs_mem_attrib_stride: case 
nir_intrinsic_load_printf_buffer_address: case nir_intrinsic_load_printf_buffer_size: case nir_intrinsic_load_core_id_agx: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 869e74a43d8..2708123fa90 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1709,6 +1709,8 @@ system_value("rasterization_primitive_amd", 1); # Number of patches processed by each TCS workgroup system_value("tcs_num_patches_amd", 1) +# The stride of 1 TCS per-vertex output in memory, in bytes (a multiple of 256) +system_value("tcs_mem_attrib_stride", 1) # Whether TCS should store tessellation level outputs for TES to read system_value("tcs_tess_levels_to_tes_amd", dest_comp=1, bit_sizes=[1]) # Tessellation primitive mode for TCS diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 0de1c7dad2e..0258fe35c25 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -302,22 +302,20 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7); break; } + case nir_intrinsic_load_tcs_mem_attrib_stride: + replacement = nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256); + break; case nir_intrinsic_load_hs_out_patch_data_offset_amd: { - nir_def *tcs_num_patches = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7); - nir_def *tcs_out_vertices, *num_tcs_mem_outputs; + nir_def *num_tcs_mem_outputs; - if (stage == MESA_SHADER_TESS_CTRL) { - tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out); + if (stage == MESA_SHADER_TESS_CTRL) num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes)); - } else { - tcs_out_vertices = - nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1); + else 
num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6); - } - /* Compute the stride of a single output. */ - nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16)); - attr_stride = nir_align_imm(b, attr_stride, 256); + /* Get the stride of a single output. */ + nir_def *attr_stride = + nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256); replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs); break; } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 700743860f9..a0f363b71ac 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -50,7 +50,7 @@ struct si_shader_args { /* Layout of TCS outputs in the offchip buffer * [0:6] (7 bits) = the number of patches per threadgroup, max = 127 * [7:11] (5 bits) = patch_vertices_in - 1, different for TCS and TES, max = 31 - * [12:16] (5 bits) = (unused) + * [12:16] (5 bits) = the stride of 1 TCS per-vertex output in memory / 256, max = 16 * [17:22] (6 bits) = the number of LS outputs in LDS, max = 63 * [23:28] (6 bits) = the number of HS per-vertex outputs in memory, max = 63 * [29:30] (2 bits) = TES output primitive type (TCS only) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 63811ee21b9..f6a36e52de8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4833,10 +4833,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx) /* Compute userdata SGPRs. 
*/ unsigned num_lds_vs_outputs = lds_input_vertex_size / 16; + unsigned tcs_mem_attrib_stride = align(num_patches * num_tcs_output_cp * 16, 256) / 256; + assert(ls_current->config.lds_size == 0); assert(num_tcs_input_cp <= 32); assert(num_tcs_output_cp <= 32); assert(num_patches <= 127); + assert(tcs_mem_attrib_stride <= 31); assert(num_lds_vs_outputs <= 63); assert(num_mem_tcs_outputs <= 63); @@ -4846,7 +4849,8 @@ void si_update_tess_io_layout_state(struct si_context *sctx) si_resource(sctx->screen->tess_rings)->gpu_address; assert((ring_va & BITFIELD_MASK(19)) == 0); - unsigned shared_fields = num_patches | (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23); + unsigned shared_fields = num_patches | (tcs_mem_attrib_stride << 12) | + (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23); sctx->tes_offchip_ring_va_sgpr = ring_va; sctx->tcs_offchip_layout = (sctx->tcs_offchip_layout & 0xe0000000) |