diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index 498cdc69d36..87905c3e67a 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -124,14 +124,14 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) if (s->gfx_state->ts.patch_control_points) { replacement = nir_imm_int(b, s->gfx_state->ts.patch_control_points); } else { - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS); + nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); replacement = nir_iadd_imm_nuw(b, n, 1); } } else if (stage == MESA_SHADER_TESS_EVAL) { if (s->info->tes.tcs_vertices_out) { replacement = nir_imm_int(b, s->info->tes.tcs_vertices_out); } else { - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP); + nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); replacement = nir_iadd_imm_nuw(b, n, 1); } } else @@ -260,7 +260,8 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out); num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs); } else { - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP); + assert(stage == MESA_SHADER_TESS_EVAL); + nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1); num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS); } diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 8376f84a7b5..69a09eae418 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -10634,7 +10634,7 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer) const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL); const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; struct radeon_cmdbuf *cs = cmd_buffer->cs; - uint32_t tcs_offchip_layout = 0; + uint32_t tcs_offchip_layout = 0, tes_offchip_layout = 0; uint32_t pgm_hs_rsrc2 = 0; if (pdev->info.gfx_level >= GFX9) { @@ -10654,21 +10654,23 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer) const uint32_t tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tcs, AC_UD_TCS_OFFCHIP_LAYOUT); const uint32_t tes_offchip_layout_offset = radv_get_user_sgpr_loc(tes, AC_UD_TCS_OFFCHIP_LAYOUT); if (tcs_offchip_layout_offset) { - tcs_offchip_layout = SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS, d->vk.ts.patch_control_points - 1) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP, tcs->info.tcs.tcs_vertices_out - 1) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches - 1) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) | - SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE, tes->info.tes._primitive_mode); + uint32_t tmp = SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches - 1) | + SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) | + SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) | + SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) | + SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE, tes->info.tes._primitive_mode); + tcs_offchip_layout = + tmp | SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN, d->vk.ts.patch_control_points - 1); + tes_offchip_layout = + tmp | SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN, tcs->info.tcs.tcs_vertices_out - 1); assert(tes_offchip_layout_offset); } if (pdev->info.gfx_level >= GFX12) { gfx12_push_sh_reg(cmd_buffer, tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2); - if (tcs_offchip_layout) { + if (tcs_offchip_layout || tes_offchip_layout) { gfx12_push_sh_reg(cmd_buffer, tcs_offchip_layout_offset, tcs_offchip_layout); - gfx12_push_sh_reg(cmd_buffer, tes_offchip_layout_offset, tcs_offchip_layout); + gfx12_push_sh_reg(cmd_buffer, tes_offchip_layout_offset, tes_offchip_layout); } } else { radeon_begin(cs); @@ -10681,9 +10683,9 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer) radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2); } - if (tcs_offchip_layout) { + if (tcs_offchip_layout || tes_offchip_layout) { radeon_set_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout); - radeon_set_sh_reg(tes_offchip_layout_offset, tcs_offchip_layout); + radeon_set_sh_reg(tes_offchip_layout_offset, tes_offchip_layout); } radeon_end(); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 4ea29285b25..e177fa54c4b 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -202,10 +202,8 @@ struct radv_nir_compiler_options { #define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__SHIFT 0 #define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__MASK 0x7f -#define TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS__SHIFT 12 -#define TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS__MASK 0x1f -#define TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP__SHIFT 7 -#define TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP__MASK 0x1f +#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__SHIFT 7 +#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__MASK 0x1f #define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__SHIFT 17 #define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__MASK 0x3f #define TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS__SHIFT 23 diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index ad9b8fd6a34..2a3c3766d88 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -108,8 +108,9 @@ struct radv_shader_args { /* TCS */ /* # [0:6] = the number of tessellation patches minus one, max = 127 - * # [7:11] = the number of output patch control points minus one, max = 31 - * # [12:16] = the number of input patch control points minus one, max = 31 + * # [7:11] = TCS: the number of input patch control points minus one, max = 31 + * TES: the number of output patch control points minus one, max = 31 + * # [12:16] = (unused) * # [17:22] = the number of LS outputs, up to 32 * # [23:28] = the number of HS per-vertex outputs, up to 32 * # [29:30] = tess_primitive_mode diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 6b3b6a336e7..30b9bf84767 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -268,13 +268,8 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s break; } case nir_intrinsic_load_patch_vertices_in: - if (stage == MESA_SHADER_TESS_CTRL) - replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5); - else if (stage == MESA_SHADER_TESS_EVAL) { - replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5); - } else - unreachable("no nir_load_patch_vertices_in"); - replacement = nir_iadd_imm(b, replacement, 1); + replacement = + nir_iadd_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1); break; case nir_intrinsic_load_sample_mask_in: replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a07ebfb4525..c1630cec1ef 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1200,6 +1200,7 @@ struct si_context { bool last_tess_uses_primid; unsigned num_patches_per_workgroup; unsigned tcs_offchip_layout; + unsigned tes_offchip_layout; unsigned tes_offchip_ring_va_sgpr; unsigned ls_hs_rsrc2; unsigned ls_hs_config; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index ba048738dee..e4c3c1ef026 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -48,20 +48,13 @@ struct si_shader_args { /* API TCS & TES */ /* Layout of TCS outputs in the offchip buffer - * # 7 bits - * [0:6] = the number of patches per threadgroup - 1, max = 127 - * # 5 bits - * [7:11] = the number of output vertices per patch - 1, max = 31 - * # 5 bits - * [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only) - * # 6 bits - * [17:22] = the number of LS outputs in LDS, max = 63 - * # 6 bits - * [23:28] = the number of HS per-vertex outputs in memory, max = 63 - * # 2 bits - * [29:30] = TES output primitive type - * # 1 bit - * [31] = whether TES reads tess factor outputs from TCS + * [0:6] (7 bits) = the number of patches per threadgroup - 1, max = 127 + * [7:11] (5 bits) = patch_vertices_in - 1, different for TCS and TES, max = 31 + * [12:16] (5 bits) = (unused) + * [17:22] (6 bits) = the number of LS outputs in LDS, max = 63 + * [23:28] (6 bits) = the number of HS per-vertex outputs in memory, max = 63 + * [29:30] (2 bits) = TES output primitive type (TCS only) + * [31] (1 bit) = whether TES reads tess factor outputs from TCS (TCS only) */ struct ac_arg tcs_offchip_layout; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 144ffbda2be..5eabec2733a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4846,11 +4846,12 @@ void si_update_tess_io_layout_state(struct si_context *sctx) si_resource(sctx->screen->tess_rings)->gpu_address; assert((ring_va & BITFIELD_MASK(19)) == 0); + unsigned shared_fields = (num_patches - 1) | (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23); + sctx->tes_offchip_ring_va_sgpr = ring_va; - sctx->tcs_offchip_layout &= 0xe0000000; - sctx->tcs_offchip_layout |= - (num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) | - (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23); + sctx->tcs_offchip_layout = (sctx->tcs_offchip_layout & 0xe0000000) | + shared_fields | ((num_tcs_input_cp - 1) << 7); + sctx->tes_offchip_layout = shared_fields | ((num_tcs_output_cp - 1) << 7); unsigned ls_hs_rsrc2; @@ -4939,18 +4940,18 @@ static void gfx6_emit_tess_io_layout_state(struct si_context *sctx, unsigned ind if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, - sctx->tcs_offchip_layout); + sctx->tes_offchip_layout); gfx11_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, sctx->tes_offchip_ring_va_sgpr); } else if (sctx->ngg || sctx->shader.gs.cso) { radeon_opt_set_sh_reg2(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, - sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); + sctx->tes_offchip_layout, sctx->tes_offchip_ring_va_sgpr); } else { radeon_opt_set_sh_reg2(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX, - sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); + sctx->tes_offchip_layout, sctx->tes_offchip_ring_va_sgpr); } radeon_end(); @@ -4994,7 +4995,7 @@ static void gfx12_emit_tess_io_layout_state(struct si_context *sctx, unsigned in */ gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, - sctx->tcs_offchip_layout); + sctx->tes_offchip_layout); gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, sctx->tes_offchip_ring_va_sgpr);