mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-27 01:50:10 +01:00
radv,radeonsi: precompute and pass TCS per-vertex output stride via a user SGPR
The value passed is the stride of a single TCS per-vertex output in memory, which isn't 16: it's 16 * num_threads, aligned to 256. tcs_offchip_layout has 5 unused bits, so let's use them to hold the precomputed stride divided by 256. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34780>
This commit is contained in:
parent
742227c65c
commit
a59464b6e3
10 changed files with 51 additions and 43 deletions
|
|
@ -477,11 +477,7 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign
|
|||
nir_def *out_vertices_per_patch = b->shader->info.stage == MESA_SHADER_TESS_CTRL
|
||||
? nir_imm_int(b, b->shader->info.tess.tcs_vertices_out)
|
||||
: nir_load_patch_vertices_in(b);
|
||||
|
||||
nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
|
||||
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
|
||||
/* Align the stride to 256B. */
|
||||
attr_stride = nir_align_imm(b, attr_stride, 256);
|
||||
nir_def *attr_stride = nir_load_tcs_mem_attrib_stride(b);
|
||||
nir_def *off =
|
||||
ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
|
||||
hs_output_vram_map_io_location(b->shader, true, location, st));
|
||||
|
|
|
|||
|
|
@ -242,34 +242,35 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
|
|||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
|
||||
nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs;
|
||||
|
||||
case nir_intrinsic_load_tcs_mem_attrib_stride:
|
||||
case nir_intrinsic_load_hs_out_patch_data_offset_amd:
|
||||
if (s->info->num_tess_patches) {
|
||||
num_patches = nir_imm_int(b, s->info->num_tess_patches);
|
||||
/* The stride is a compile-time constant. */
|
||||
unsigned tcs_vertices_out =
|
||||
stage == MESA_SHADER_TESS_CTRL ? b->shader->info.tess.tcs_vertices_out : s->info->tes.tcs_vertices_out;
|
||||
assert(tcs_vertices_out);
|
||||
/* Align the stride to 256B. */
|
||||
replacement = nir_imm_int(b, align(s->info->num_tess_patches * tcs_vertices_out * 16, 256));
|
||||
} else {
|
||||
num_patches = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES);
|
||||
replacement = nir_imul_imm(
|
||||
b, GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE), 256);
|
||||
}
|
||||
|
||||
if (stage == MESA_SHADER_TESS_CTRL) {
|
||||
out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
|
||||
num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
|
||||
} else if (s->info->inputs_linked) {
|
||||
out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
|
||||
num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
|
||||
} else {
|
||||
assert(stage == MESA_SHADER_TESS_EVAL);
|
||||
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN);
|
||||
out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
|
||||
num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
|
||||
}
|
||||
if (intrin->intrinsic == nir_intrinsic_load_hs_out_patch_data_offset_amd) {
|
||||
nir_def *num_tcs_mem_outputs;
|
||||
|
||||
/* Compute the stride of a single output. */
|
||||
nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16));
|
||||
attr_stride = nir_align_imm(b, attr_stride, 256);
|
||||
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
|
||||
if (stage == MESA_SHADER_TESS_CTRL) {
|
||||
num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
|
||||
} else if (s->info->inputs_linked) {
|
||||
num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
|
||||
} else {
|
||||
assert(stage == MESA_SHADER_TESS_EVAL);
|
||||
num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
|
||||
}
|
||||
|
||||
replacement = nir_imul(b, replacement, num_tcs_mem_outputs);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_sample_positions_amd: {
|
||||
uint32_t sample_pos_offset = (RING_PS_SAMPLE_POSITIONS * 16) - 8;
|
||||
|
||||
|
|
|
|||
|
|
@ -10654,7 +10654,11 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer)
|
|||
const uint32_t tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tcs, AC_UD_TCS_OFFCHIP_LAYOUT);
|
||||
const uint32_t tes_offchip_layout_offset = radv_get_user_sgpr_loc(tes, AC_UD_TCS_OFFCHIP_LAYOUT);
|
||||
if (tcs_offchip_layout_offset) {
|
||||
unsigned tcs_out_mem_attrib_stride =
|
||||
align(cmd_buffer->state.tess_num_patches * tcs->info.tcs.tcs_vertices_out * 16, 256) / 256;
|
||||
|
||||
uint32_t tmp = SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches) |
|
||||
SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE, tcs_out_mem_attrib_stride) |
|
||||
SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) |
|
||||
SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) |
|
||||
SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) |
|
||||
|
|
|
|||
|
|
@ -202,8 +202,10 @@ struct radv_nir_compiler_options {
|
|||
|
||||
#define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__SHIFT 0
|
||||
#define TCS_OFFCHIP_LAYOUT_NUM_PATCHES__MASK 0x7f
|
||||
#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__SHIFT 7
|
||||
#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__MASK 0x1f
|
||||
#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__SHIFT 7
|
||||
#define TCS_OFFCHIP_LAYOUT_PATCH_VERTICES_IN__MASK 0x1f
|
||||
#define TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE__SHIFT 12
|
||||
#define TCS_OFFCHIP_LAYOUT_TCS_MEM_ATTRIB_STRIDE__MASK 0x1f
|
||||
#define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__SHIFT 17
|
||||
#define TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS__MASK 0x3f
|
||||
#define TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS__SHIFT 23
|
||||
|
|
|
|||
|
|
@ -110,7 +110,7 @@ struct radv_shader_args {
|
|||
/* # [0:6] = the number of tessellation patches, max = 127
|
||||
* # [7:11] = TCS: the number of input patch control points minus one, max = 31
|
||||
* TES: the number of output patch control points minus one, max = 31
|
||||
* # [12:16] = (unused)
|
||||
* # [12:16] = the stride of 1 TCS per-vertex output in memory / 256, max = 16
|
||||
* # [17:22] = the number of LS outputs, up to 32
|
||||
* # [23:28] = the number of HS per-vertex outputs, up to 32
|
||||
* # [29:30] = tess_primitive_mode
|
||||
|
|
|
|||
|
|
@ -348,6 +348,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||
case nir_intrinsic_load_fbfetch_image_fmask_desc_amd:
|
||||
case nir_intrinsic_load_fbfetch_image_desc_amd:
|
||||
case nir_intrinsic_load_polygon_stipple_buffer_amd:
|
||||
case nir_intrinsic_load_tcs_mem_attrib_stride:
|
||||
case nir_intrinsic_load_printf_buffer_address:
|
||||
case nir_intrinsic_load_printf_buffer_size:
|
||||
case nir_intrinsic_load_core_id_agx:
|
||||
|
|
|
|||
|
|
@ -1709,6 +1709,8 @@ system_value("rasterization_primitive_amd", 1);
|
|||
|
||||
# Number of patches processed by each TCS workgroup
|
||||
system_value("tcs_num_patches_amd", 1)
|
||||
# The stride of 1 TCS per-vertex output in memory (a multiple of 256; drivers store it / 256 in a user SGPR and multiply it back by 256 when lowering)
|
||||
system_value("tcs_mem_attrib_stride", 1)
|
||||
# Whether TCS should store tessellation level outputs for TES to read
|
||||
system_value("tcs_tess_levels_to_tes_amd", dest_comp=1, bit_sizes=[1])
|
||||
# Tessellation primitive mode for TCS
|
||||
|
|
|
|||
|
|
@ -302,22 +302,20 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
|
|||
replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_tcs_mem_attrib_stride:
|
||||
replacement = nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256);
|
||||
break;
|
||||
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
|
||||
nir_def *tcs_num_patches = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
|
||||
nir_def *tcs_out_vertices, *num_tcs_mem_outputs;
|
||||
nir_def *num_tcs_mem_outputs;
|
||||
|
||||
if (stage == MESA_SHADER_TESS_CTRL) {
|
||||
tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
|
||||
if (stage == MESA_SHADER_TESS_CTRL)
|
||||
num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes));
|
||||
} else {
|
||||
tcs_out_vertices =
|
||||
nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1);
|
||||
else
|
||||
num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
|
||||
}
|
||||
|
||||
/* Compute the stride of a single output. */
|
||||
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16));
|
||||
attr_stride = nir_align_imm(b, attr_stride, 256);
|
||||
/* Get the stride of a single output. */
|
||||
nir_def *attr_stride =
|
||||
nir_imul_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5), 256);
|
||||
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ struct si_shader_args {
|
|||
/* Layout of TCS outputs in the offchip buffer
|
||||
* [0:6] (7 bits) = the number of patches per threadgroup, max = 127
|
||||
* [7:11] (5 bits) = patch_vertices_in - 1, different for TCS and TES, max = 31
|
||||
* [12:16] (5 bits) = (unused)
|
||||
* [12:16] (5 bits) = the stride of 1 TCS per-vertex output in memory / 256, max = 16
|
||||
* [17:22] (6 bits) = the number of LS outputs in LDS, max = 63
|
||||
* [23:28] (6 bits) = the number of HS per-vertex outputs in memory, max = 63
|
||||
* [29:30] (2 bits) = TES output primitive type (TCS only)
|
||||
|
|
|
|||
|
|
@ -4833,10 +4833,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
|
||||
/* Compute userdata SGPRs. */
|
||||
unsigned num_lds_vs_outputs = lds_input_vertex_size / 16;
|
||||
unsigned tcs_mem_attrib_stride = align(num_patches * num_tcs_output_cp * 16, 256) / 256;
|
||||
|
||||
assert(ls_current->config.lds_size == 0);
|
||||
assert(num_tcs_input_cp <= 32);
|
||||
assert(num_tcs_output_cp <= 32);
|
||||
assert(num_patches <= 127);
|
||||
assert(tcs_mem_attrib_stride <= 31);
|
||||
assert(num_lds_vs_outputs <= 63);
|
||||
assert(num_mem_tcs_outputs <= 63);
|
||||
|
||||
|
|
@ -4846,7 +4849,8 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
si_resource(sctx->screen->tess_rings)->gpu_address;
|
||||
assert((ring_va & BITFIELD_MASK(19)) == 0);
|
||||
|
||||
unsigned shared_fields = num_patches | (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23);
|
||||
unsigned shared_fields = num_patches | (tcs_mem_attrib_stride << 12) |
|
||||
(num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23);
|
||||
|
||||
sctx->tes_offchip_ring_va_sgpr = ring_va;
|
||||
sctx->tcs_offchip_layout = (sctx->tcs_offchip_layout & 0xe0000000) |
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue