radeonsi: Use one more bit for number of patches in TCS offchip layout.

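Bit 16 of the TCS offchip layout was reserved and unused, so give it
to the "number of patches" field, widening that field from 6 to 7 bits.
The output and input vertices-per-patch fields each move up by one bit
to make room. In the future, this may allow increasing the maximum
number of patches per workgroup.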

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28425>
This commit is contained in:
Timur Kristóf 2024-03-30 02:01:03 +01:00
parent 04dea4aef2
commit b34e99d021
3 changed files with 11 additions and 19 deletions

View file

@ -338,9 +338,9 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
}
case nir_intrinsic_load_patch_vertices_in:
if (stage == MESA_SHADER_TESS_CTRL)
replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 5);
replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5);
else if (stage == MESA_SHADER_TESS_EVAL) {
replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
} else
unreachable("no nir_load_patch_vertices_in");
replacement = nir_iadd_imm(b, replacement, 1);
@ -372,7 +372,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
}
break;
case nir_intrinsic_load_tcs_num_patches_amd: {
nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
replacement = nir_iadd_imm(b, tmp, 1);
break;
}
@ -387,12 +387,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
} else {
nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
}
nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
break;

View file

@ -51,14 +51,12 @@ struct si_shader_args {
/* API TCS & TES */
/* Layout of TCS outputs in the offchip buffer
* # 6 bits
* [0:5] = the number of patches per threadgroup - 1, max = 63
* # 7 bits
* [0:6] = the number of patches per threadgroup - 1, max = 127
* # 5 bits
* [6:10] = the number of output vertices per patch - 1, max = 31
* [7:11] = the number of output vertices per patch - 1, max = 31
* # 5 bits
* [11:15] = the number of input vertices per patch - 1, max = 31 (TCS only)
* # 1 bit
* [16] = reserved for future use
* [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only)
* # 6 bits
* [17:22] = the number of LS outputs, max = 63
* # 6 bits

View file

@ -4587,18 +4587,12 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
si_mark_atom_dirty(sctx, &sctx->atoms.s.vgt_pipeline_state);
}
unsigned output_patch0_offset = input_patch_size * num_patches;
unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
/* Compute userdata SGPRs. */
assert(((input_vertex_size / 4) & ~0xff) == 0);
assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
assert(num_tcs_input_cp <= 32);
assert(num_tcs_output_cp <= 32);
assert(num_patches <= 64);
assert(num_patches <= 128);
assert(num_vs_outputs <= 63);
assert(num_tcs_outputs <= 63);
assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);
uint64_t ring_va =
sctx->ws->cs_is_secure(&sctx->gfx_cs) ?
@ -4609,7 +4603,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
sctx->tes_offchip_ring_va_sgpr = ring_va;
sctx->tcs_offchip_layout &= 0xe0000000;
sctx->tcs_offchip_layout |=
(num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
(num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) |
(num_vs_outputs << 17) | (num_tcs_outputs << 23);
/* Compute the LDS size. */