diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index 3854dccdc71..97ac6377233 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -92,6 +92,17 @@ struct ac_shader_args { struct ac_arg tcs_patch_id; struct ac_arg tcs_rel_ids; + /* # [0:6] = the number of tessellation patches, max = 127 + * # [7:11] = TCS: the number of input patch control points minus one, max = 31 + * TES: the number of output patch control points minus one, max = 31 + * # [12:16] = the stride of 1 TCS per-vertex output in memory / 256, max = 16 + * # [17:22] = the number of LS outputs, up to 32 + * # [23:28] = the number of HS per-vertex outputs, up to 32 + * # [29:30] = tess_primitive_mode + * # [31] = whether TES reads tess factors + */ + struct ac_arg tcs_offchip_layout; + /* TES */ struct ac_arg tes_u; struct ac_arg tes_v; diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index 5508fe70bc9..7b193b0ffe9 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -80,7 +80,7 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) if (s->info->num_tess_patches) { replacement = nir_imm_int(b, s->info->num_tess_patches); } else { - replacement = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES); + replacement = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES); } break; case nir_intrinsic_load_tcs_tess_levels_to_tes_amd: @@ -88,14 +88,14 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) replacement = nir_imm_bool(b, s->info->tcs.tes_reads_tess_factors); } else { replacement = - nir_ine_imm(b, GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TES_READS_TF), 0); + nir_ine_imm(b, GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TES_READS_TF), 0); } break; case nir_intrinsic_load_tcs_primitive_mode_amd: if (s->info->outputs_linked) { replacement = nir_imm_int(b, s->info->tes._primitive_mode); } else { - replacement = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE); + replacement = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE); } break; case nir_intrinsic_load_ring_esgs_amd: @@ -123,14 +123,14 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) if (s->gfx_state->ts.patch_control_points) { replacement = nir_imm_int(b, s->gfx_state->ts.patch_control_points); } else { - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); + nir_def *n = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); replacement = nir_iadd_imm_nuw(b, n, 1); } } else if (stage == MESA_SHADER_TESS_EVAL) { if (s->info->tes.tcs_vertices_out) { replacement = nir_imm_int(b, s->info->tes.tcs_vertices_out); } else { - nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); + nir_def *n = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN); replacement = nir_iadd_imm_nuw(b, n, 1); } } else @@ -223,7 +223,7 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) if (s->info->inputs_linked) { replacement = nir_imm_int(b, get_tcs_input_vertex_stride(s->info->tcs.num_linked_inputs)); } else { - nir_def *num_ls_out = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS); + nir_def *num_ls_out = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS); nir_def *extra_dw = nir_bcsel(b, nir_ieq_imm(b, num_ls_out, 0), nir_imm_int(b, 0), nir_imm_int(b, 4)); replacement = nir_iadd_nuw(b, nir_ishl_imm(b, num_ls_out, 4), extra_dw); } @@ -253,7 +253,7 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) replacement = nir_imm_int(b, align(s->info->num_tess_patches * tcs_vertices_out * 16, 256)); } else { replacement = nir_imul_imm( - b, GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE), 256); + b, GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE), 256); } if (intrin->intrinsic == nir_intrinsic_load_hs_out_patch_data_offset_amd) { @@ -265,7 +265,7 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs); } else { assert(stage == MESA_SHADER_TESS_EVAL); - num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS); + num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->ac.tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS); } replacement = nir_imul(b, replacement, num_tcs_mem_outputs); diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index 37cd08140ae..3a609eb56b3 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -402,7 +402,7 @@ declare_unmerged_vs_tcs_args(const enum amd_gfx_level gfx_level, const struct ra declare_global_input_sgprs(gfx_level, info, user_sgpr_info, args); add_ud_arg(args, 1, AC_ARG_INT, &args->ac.view_index, AC_UD_VIEW_INDEX); - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); add_ud_arg(args, 1, AC_ARG_INT, &args->epilog_pc, AC_UD_EPILOG_PC); add_ud_arg(args, 1, AC_ARG_INT, &args->next_stage_pc, AC_UD_NEXT_STAGE_PC); @@ -427,7 +427,7 @@ declare_unmerged_vs_tcs_args(const enum amd_gfx_level gfx_level, const struct ra ac_add_preserved(&args->ac, &args->descriptor_sets[0]); ac_add_preserved(&args->ac, &args->ac.push_constants); ac_add_preserved(&args->ac, &args->ac.view_index); - ac_add_preserved(&args->ac, &args->tcs_offchip_layout); + ac_add_preserved(&args->ac, &args->ac.tcs_offchip_layout); ac_add_preserved(&args->ac, &args->epilog_pc); /* Preserved VGPRs */ @@ -449,7 +449,7 @@ declare_unmerged_vs_tes_gs_args(const enum amd_gfx_level gfx_level, const struct declare_global_input_sgprs(gfx_level, info, user_sgpr_info, args); add_ud_arg(args, 1, AC_ARG_INT, &args->ac.view_index, AC_UD_VIEW_INDEX); - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); if (info->is_ngg) { add_ud_arg(args, 1, AC_ARG_INT, &args->ngg_state, AC_UD_NGG_STATE); @@ -495,7 +495,7 @@ declare_unmerged_vs_tes_gs_args(const enum amd_gfx_level gfx_level, const struct if (gfx_level >= GFX12) ac_add_preserved(&args->ac, &args->streamout_state); ac_add_preserved(&args->ac, &args->ac.view_index); - ac_add_preserved(&args->ac, &args->tcs_offchip_layout); + ac_add_preserved(&args->ac, &args->ac.tcs_offchip_layout); if (info->is_ngg) { ac_add_preserved(&args->ac, &args->ngg_state); if (gfx_level >= GFX11) @@ -686,7 +686,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics } if (radv_tcs_needs_state_sgpr(info, gfx_state)) { - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); } ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id); @@ -702,7 +702,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics } if (radv_tcs_needs_state_sgpr(info, gfx_state)) { - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); } ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); @@ -724,7 +724,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics add_ud_arg(args, 1, AC_ARG_INT, &args->ac.view_index, AC_UD_VIEW_INDEX); if (radv_tes_needs_state_sgpr(info)) - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); if (info->tes.as_es) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); @@ -776,7 +776,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics } if (previous_stage == MESA_SHADER_TESS_EVAL && radv_tes_needs_state_sgpr(info)) - add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); + add_ud_arg(args, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT); /* Legacy GS force vrs is handled by GS copy shader. */ if (info->force_vrs_per_vertex && info->is_ngg) { diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index e57603cc950..894eb70003b 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -106,18 +106,6 @@ struct radv_shader_args { struct ac_arg stencil; struct ac_arg sample_mask; - /* TCS */ - /* # [0:6] = the number of tessellation patches, max = 127 - * # [7:11] = TCS: the number of input patch control points minus one, max = 31 - * TES: the number of output patch control points minus one, max = 31 - * # [12:16] = the stride of 1 TCS per-vertex output in memory / 256, max = 16 - * # [17:22] = the number of LS outputs, up to 32 - * # [23:28] = the number of HS per-vertex outputs, up to 32 - * # [29:30] = tess_primitive_mode - * # [31] = whether TES reads tess factors - */ - struct ac_arg tcs_offchip_layout; - /* GS */ struct ac_arg vgt_esgs_ring_itemsize; diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 617a1ff96fd..503e2f277c2 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -269,7 +269,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s } case nir_intrinsic_load_patch_vertices_in: replacement = - nir_iadd_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1); + nir_iadd_imm(b, ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 7, 5), 1); break; case nir_intrinsic_load_sample_mask_in: replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage); @@ -281,7 +281,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s if (sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic) { replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader)); } else { - nir_def *num_ls_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 17, 6); + nir_def *num_ls_out = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 17, 6); nir_def *extra_dw = nir_bcsel(b, nir_ieq_imm(b, num_ls_out, 0), nir_imm_int(b, 0), nir_imm_int(b, 4)); replacement = nir_iadd_nuw(b, nir_ishl_imm(b, num_ls_out, 4), extra_dw); } @@ -299,11 +299,11 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s } break; case nir_intrinsic_load_tcs_num_patches_amd: { - replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7); + replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 0, 7); break; } case nir_intrinsic_load_tcs_mem_attrib_stride: - replacement = nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256); + replacement = nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 12, 5), 256); break; case nir_intrinsic_load_hs_out_patch_data_offset_amd: { nir_def *num_tcs_mem_outputs; @@ -311,11 +311,11 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s if (stage == MESA_SHADER_TESS_CTRL) num_tcs_mem_outputs = nir_imm_int(b, sel->info.tess_io_info.highest_remapped_vram_output); else - num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6); + num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 23, 6); /* Get the stride of a single output. */ nir_def *attr_stride = - nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256); + nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 12, 5), 256); replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs); break; } @@ -564,7 +564,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s if (shader->is_monolithic) { replacement = nir_imm_bool(b, key->ge.opt.tes_reads_tess_factors); } else { - replacement = nir_ine_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 31, 1), 0); + replacement = nir_ine_imm(b, ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 31, 1), 0); } break; case nir_intrinsic_load_tcs_primitive_mode_amd: @@ -574,7 +574,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s if (b->shader->info.tess._primitive_mode != TESS_PRIMITIVE_UNSPECIFIED) replacement = nir_imm_int(b, b->shader->info.tess._primitive_mode); else - replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 29, 2); + replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_offchip_layout, 29, 2); } break; case nir_intrinsic_load_ring_gsvs_amd: { diff --git a/src/gallium/drivers/radeonsi/si_shader_args.c b/src/gallium/drivers/radeonsi/si_shader_args.c index 2e23357453c..48cd90121d4 100644 --- a/src/gallium/drivers/radeonsi/si_shader_args.c +++ b/src/gallium/drivers/radeonsi/si_shader_args.c @@ -225,7 +225,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args, case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ declare_global_desc_pointers(args); declare_per_stage_desc_pointers(args, shader, info, true); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); @@ -262,7 +262,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); /* VGPRs (first TCS, then VS) */ @@ -337,7 +337,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); } else if (stage == MESA_SHADER_TESS_EVAL) { - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ } else { @@ -393,7 +393,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args, declare_global_desc_pointers(args); declare_per_stage_desc_pointers(args, shader, info, true); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_offchip_layout); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); if (shader->key.ge.as_es) { diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index a0f363b71ac..876d2d39cab 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -46,18 +46,6 @@ struct si_shader_args { struct ac_arg vs_state_bits; struct ac_arg vs_blit_inputs; - /* API TCS & TES */ - /* Layout of TCS outputs in the offchip buffer - * [0:6] (7 bits) = the number of patches per threadgroup, max = 127 - * [7:11] (5 bits) = patch_vertices_in - 1, different for TCS and TES, max = 31 - * [12:16] (5 bits) = the stride of 1 TCS per-vertex output in memory / 256, max = 16 - * [17:22] (6 bits) = the number of LS outputs in LDS, max = 63 - * [23:28] (6 bits) = the number of HS per-vertex outputs in memory, max = 63 - * [29:30] (2 bits) = TES output primitive type (TCS only) - * [31] (1 bit) = whether TES reads tess factor outputs from TCS (TCS only) - */ - struct ac_arg tcs_offchip_layout; - /* API TCS & TES */ struct ac_arg tes_offchip_addr; /* PS */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 17846bcb0ab..f88fb2d9173 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -62,7 +62,7 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx) ret = si_insert_input_ret(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); - ret = si_insert_input_ret(ctx, ret, ctx->args->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->args->ac.tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); ret = si_insert_input_ret(ctx, ret, ctx->args->tes_offchip_addr, 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR); unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;