mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2025-12-24 19:40:10 +01:00)
radeonsi: move si_update/emit_tess_io_layout_state into si_state_shaders.cpp
to reduce the amount of code in si_state_draw.cpp.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
parent 9999660386
commit e234c9fc21
2 changed files with 301 additions and 305 deletions
@@ -631,309 +631,6 @@ static void si_prefetch_shaders(struct si_context *sctx)
   sctx->prefetch_L2_mask = 0;
}

#if GFX_VER == 6 /* declare these functions only once because they support all chips. */

/**
 * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
 * LS.LDS_SIZE is shared by all 3 shader stages.
 *
 * The information about LDS and other non-compile-time parameters is then
 * written to userdata SGPRs.
 *
 * This depends on:
 * - patch_vertices
 * - VS and the currently selected shader variant (called by si_update_shaders)
 * - TCS and the currently selected shader variant (called by si_update_shaders)
 * - tess_uses_prim_id (called by si_update_shaders)
 * - sh_base[TESS_EVAL] depending on GS on/off (called by si_update_shaders)
 */
void si_update_tess_io_layout_state(struct si_context *sctx)
{
   struct si_shader *ls_current;
   struct si_shader_selector *ls;
   struct si_shader_selector *tcs = sctx->shader.tcs.cso;
   unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
   bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
   uint8_t num_tcs_input_cp = sctx->patch_vertices;

   assert(sctx->shader.tcs.current);

   /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
   if (sctx->gfx_level >= GFX9) {
      ls_current = sctx->shader.tcs.current;
      ls = ls_current->key.ge.part.tcs.ls;
   } else {
      ls_current = sctx->shader.vs.current;
      ls = sctx->shader.vs.cso;
   }

   if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
       sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
       (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid)))
      return;

   sctx->last_ls = ls_current;
   sctx->last_tcs = tcs;
   sctx->last_tes_sh_base = tes_sh_base;
   sctx->last_num_tcs_input_cp = num_tcs_input_cp;
   sctx->last_tess_uses_primid = tess_uses_primid;

   /* This calculates how shader inputs and outputs among VS, TCS, and TES
    * are laid out in LDS. */
   unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written);
   unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
   unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);

   unsigned input_vertex_size = ls->info.lshs_vertex_stride;
   unsigned output_vertex_size = num_tcs_outputs * 16;
   unsigned input_patch_size;

   /* Allocate LDS for TCS inputs only if it's used. */
   if (!ls_current->key.ge.opt.same_patch_vertices ||
       tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs)
      input_patch_size = num_tcs_input_cp * input_vertex_size;
   else
      input_patch_size = 0;

   unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
   unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
   unsigned lds_per_patch;

   /* Compute the LDS size per patch.
    *
    * LDS is used to store TCS outputs if they are read, and to store tess
    * factors if they are not defined in all invocations.
    */
   if (tcs->info.base.outputs_read ||
       tcs->info.base.patch_outputs_read ||
       !tcs->info.tessfactors_are_def_in_all_invocs) {
      lds_per_patch = input_patch_size + output_patch_size;
   } else {
      /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
      lds_per_patch = MAX2(input_patch_size, output_patch_size);
   }

   /* Ensure that we only need 4 waves per CU, so that we don't need to check
    * resource usage (such as whether we have enough VGPRs to fit the whole
    * threadgroup into the CU). It also ensures that the number of TCS input and
    * output vertices per threadgroup is at most 256, which is the hw limit.
    */
   unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
   unsigned num_patches = 256 / max_verts_per_patch;

   /* Not necessary for correctness, but higher numbers are slower.
    * The hardware can do more, but the radeonsi shader constant is
    * limited to 6 bits.
    */
   num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */

   /* When distributed tessellation is unsupported, switch between SEs
    * at a higher frequency to manually balance the workload between SEs.
    */
   if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
      num_patches = MIN2(num_patches, 16); /* recommended */

   /* Make sure the output data fits in the offchip buffer */
   num_patches =
      MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size);

   /* Make sure that the data fits in LDS. This assumes the shaders only
    * use LDS for the inputs and outputs.
    *
    * The maximum allowed LDS size is 32K. Higher numbers can hang.
    * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
    */
   ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
   unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
   num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
   num_patches = MAX2(num_patches, 1);
   assert(num_patches * lds_per_patch <= max_lds_size);

   /* Make sure that vector lanes are fully occupied by cutting off the last wave
    * if it's only partially filled.
    */
   unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
   unsigned wave_size = ls_current->wave_size;

   if (temp_verts_per_tg > wave_size &&
       (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
      num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;

   if (sctx->gfx_level == GFX6) {
      /* GFX6 bug workaround, related to power management. Limit LS-HS
       * threadgroups to only one wave.
       */
      unsigned one_wave = wave_size / max_verts_per_patch;
      num_patches = MIN2(num_patches, one_wave);
   }

   /* The VGT HS block increments the patch ID unconditionally
    * within a single threadgroup. This results in incorrect
    * patch IDs when instanced draws are used.
    *
    * The intended solution is to restrict threadgroups to
    * a single instance by setting SWITCH_ON_EOI, which
    * should cause IA to split instances up. However, this
    * doesn't work correctly on GFX6 when there is no other
    * SE to switch to.
    */
   if (has_primid_instancing_bug && tess_uses_primid)
      num_patches = 1;

   sctx->num_patches_per_workgroup = num_patches;

   unsigned output_patch0_offset = input_patch_size * num_patches;
   unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

   /* Compute userdata SGPRs. */
   assert(((input_vertex_size / 4) & ~0xff) == 0);
   assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
   assert(num_tcs_input_cp <= 32);
   assert(num_tcs_output_cp <= 32);
   assert(num_patches <= 64);
   assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);

   uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?
      si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
   assert((ring_va & u_bit_consecutive(0, 19)) == 0);

   sctx->tes_offchip_ring_va_sgpr = ring_va;
   sctx->tcs_offchip_layout =
      (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
      ((pervertex_output_patch_size * num_patches) << 16);

   /* Compute the LDS size. */
   unsigned lds_size = lds_per_patch * num_patches;

   if (sctx->gfx_level >= GFX7) {
      assert(lds_size <= 65536);
      lds_size = align(lds_size, 512) / 512;
   } else {
      assert(lds_size <= 32768);
      lds_size = align(lds_size, 256) / 256;
   }

   /* Set SI_SGPR_VS_STATE_BITS. */
   SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4);
   SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4);

   /* We should be able to support in-shader LDS use with LLVM >= 9
    * by just adding the lds_sizes together, but it has never
    * been tested. */
   assert(ls_current->config.lds_size == 0);

   unsigned ls_hs_rsrc2;

   if (sctx->gfx_level >= GFX9) {
      ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2;

      if (sctx->gfx_level >= GFX10)
         ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
      else
         ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
   } else {
      ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2;

      si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
      ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
   }

   sctx->ls_hs_rsrc2 = ls_hs_rsrc2;
   sctx->ls_hs_config =
      S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) |
      S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
      S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout);
}
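To make the last-wave trimming step above concrete, here is a small worked case written as a comment; all numbers are assumed for illustration and do not come from the commit:

/* Hypothetical example of the occupancy trim above:
 * wave_size = 64, max_verts_per_patch = 4, num_patches = 17.
 * temp_verts_per_tg = 17 * 4 = 68 vertices, i.e. one full wave plus
 * 4 stray lanes. The trim condition holds because 68 > 64 and
 * 64 - (68 % 64) = 60 >= MAX2(4, 8) = 8, so the threadgroup is
 * rounded down to whole waves:
 *    num_patches = (68 & ~63) / 4 = 64 / 4 = 16 patches,
 * which fills exactly one 64-lane wave with no idle lanes.
 */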

static void si_emit_tess_io_layout_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   radeon_begin(cs);

   if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
      return;

   if (sctx->screen->info.has_set_pairs_packets) {
      radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                                 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for merged LS-HS. */
      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                                 GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                                 sctx->tcs_offchip_layout);
      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                                 GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
                                 sctx->tes_offchip_ring_va_sgpr);
   } else if (sctx->gfx_level >= GFX9) {
      radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                            SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for merged LS-HS. */
      radeon_opt_set_sh_reg2(sctx,
                             R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                             GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
   } else {
      /* Due to a hw bug, RSRC2_LS must be written twice with another
       * LS register written in between. */
      if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII)
         radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, sctx->ls_hs_rsrc2);
      radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
      radeon_emit(sctx->shader.vs.current->config.rsrc1);
      radeon_emit(sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for TCS. */
      radeon_opt_set_sh_reg3(sctx,
                             R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                             GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr,
                             sctx->current_vs_state);
   }

   /* Set userdata SGPRs for TES. */
   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
   assert(tes_sh_base);

   /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when
    * tessellation is disabled. That's because those user SGPRs are only set in LS
    * for tessellation.
    */
   if (sctx->screen->info.has_set_pairs_packets) {
      radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX,
                                 sctx->tcs_offchip_layout);
      radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID,
                                 sctx->tes_offchip_ring_va_sgpr);
   } else {
      bool has_gs = sctx->ngg || sctx->shader.gs.cso;

      radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
                             has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX
                                    : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
   }
   radeon_end();

   radeon_begin_again(cs);
   if (sctx->gfx_level >= GFX7) {
      radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG,
                                     SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config);
   } else {
      radeon_opt_set_context_reg(sctx, R_028B58_VGT_LS_HS_CONFIG,
                                 SI_TRACKED_VGT_LS_HS_CONFIG, sctx->ls_hs_config);
   }
   radeon_end_update_context_roll(sctx);
}
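The tcs_offchip_layout word emitted above is the one packed in si_update_tess_io_layout_state. A minimal decode sketch follows; the helper and its name are hypothetical, purely for illustration, with field positions taken from the shifts and asserts in that function:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper (not part of the commit): decodes the
 * tcs_offchip_layout userdata SGPR built in si_update_tess_io_layout_state. */
static void print_tcs_offchip_layout(uint32_t v)
{
   printf("num_patches       = %u\n", (v & 0x3f) + 1);         /* bits [5:0],   max 64 */
   printf("num_tcs_output_cp = %u\n", ((v >> 6) & 0x1f) + 1);  /* bits [10:6],  max 32 */
   printf("num_tcs_input_cp  = %u\n", ((v >> 11) & 0x1f) + 1); /* bits [15:11], max 32 */
   /* bits [31:16]: pervertex_output_patch_size * num_patches, in bytes */
   printf("out_patch_bytes   = %u\n", v >> 16);
}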

#endif

static unsigned si_num_prims_for_vertices(enum mesa_prim prim,
                                          unsigned count, unsigned vertices_per_patch)
{
@@ -2906,8 +2603,6 @@ void si_init_spi_map_functions(struct si_context *sctx)
   sctx->emit_spi_map[30] = si_emit_spi_map<30>;
   sctx->emit_spi_map[31] = si_emit_spi_map<31>;
   sctx->emit_spi_map[32] = si_emit_spi_map<32>;

   sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state;
}

#endif
@@ -4298,6 +4298,306 @@ static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertic
   }
}

/**
 * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
 * LS.LDS_SIZE is shared by all 3 shader stages.
 *
 * The information about LDS and other non-compile-time parameters is then
 * written to userdata SGPRs.
 *
 * This depends on:
 * - patch_vertices
 * - VS and the currently selected shader variant (called by si_update_shaders)
 * - TCS and the currently selected shader variant (called by si_update_shaders)
 * - tess_uses_prim_id (called by si_update_shaders)
 * - sh_base[TESS_EVAL] depending on GS on/off (called by si_update_shaders)
 */
void si_update_tess_io_layout_state(struct si_context *sctx)
{
   struct si_shader *ls_current;
   struct si_shader_selector *ls;
   struct si_shader_selector *tcs = sctx->shader.tcs.cso;
   unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
   bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
   uint8_t num_tcs_input_cp = sctx->patch_vertices;

   assert(sctx->shader.tcs.current);

   /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
   if (sctx->gfx_level >= GFX9) {
      ls_current = sctx->shader.tcs.current;
      ls = ls_current->key.ge.part.tcs.ls;
   } else {
      ls_current = sctx->shader.vs.current;
      ls = sctx->shader.vs.cso;
   }

   if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
       sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
       (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid)))
      return;

   sctx->last_ls = ls_current;
   sctx->last_tcs = tcs;
   sctx->last_tes_sh_base = tes_sh_base;
   sctx->last_num_tcs_input_cp = num_tcs_input_cp;
   sctx->last_tess_uses_primid = tess_uses_primid;

   /* This calculates how shader inputs and outputs among VS, TCS, and TES
    * are laid out in LDS. */
   unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written);
   unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
   unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);

   unsigned input_vertex_size = ls->info.lshs_vertex_stride;
   unsigned output_vertex_size = num_tcs_outputs * 16;
   unsigned input_patch_size;

   /* Allocate LDS for TCS inputs only if it's used. */
   if (!ls_current->key.ge.opt.same_patch_vertices ||
       tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs)
      input_patch_size = num_tcs_input_cp * input_vertex_size;
   else
      input_patch_size = 0;

   unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
   unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
   unsigned lds_per_patch;

   /* Compute the LDS size per patch.
    *
    * LDS is used to store TCS outputs if they are read, and to store tess
    * factors if they are not defined in all invocations.
    */
   if (tcs->info.base.outputs_read ||
       tcs->info.base.patch_outputs_read ||
       !tcs->info.tessfactors_are_def_in_all_invocs) {
      lds_per_patch = input_patch_size + output_patch_size;
   } else {
      /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
      lds_per_patch = MAX2(input_patch_size, output_patch_size);
   }

   /* Ensure that we only need 4 waves per CU, so that we don't need to check
    * resource usage (such as whether we have enough VGPRs to fit the whole
    * threadgroup into the CU). It also ensures that the number of TCS input and
    * output vertices per threadgroup is at most 256, which is the hw limit.
    */
   unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
   unsigned num_patches = 256 / max_verts_per_patch;

   /* Not necessary for correctness, but higher numbers are slower.
    * The hardware can do more, but the radeonsi shader constant is
    * limited to 6 bits.
    */
   num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */

   /* When distributed tessellation is unsupported, switch between SEs
    * at a higher frequency to manually balance the workload between SEs.
    */
   if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
      num_patches = MIN2(num_patches, 16); /* recommended */

   /* Make sure the output data fits in the offchip buffer */
   num_patches =
      MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size);

   /* Make sure that the data fits in LDS. This assumes the shaders only
    * use LDS for the inputs and outputs.
    *
    * The maximum allowed LDS size is 32K. Higher numbers can hang.
    * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
    */
   ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
   unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
   num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
   num_patches = MAX2(num_patches, 1);
   assert(num_patches * lds_per_patch <= max_lds_size);

   /* Make sure that vector lanes are fully occupied by cutting off the last wave
    * if it's only partially filled.
    */
   unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
   unsigned wave_size = ls_current->wave_size;

   if (temp_verts_per_tg > wave_size &&
       (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
      num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;

   if (sctx->gfx_level == GFX6) {
      /* GFX6 bug workaround, related to power management. Limit LS-HS
       * threadgroups to only one wave.
       */
      unsigned one_wave = wave_size / max_verts_per_patch;
      num_patches = MIN2(num_patches, one_wave);
   }

   /* The VGT HS block increments the patch ID unconditionally
    * within a single threadgroup. This results in incorrect
    * patch IDs when instanced draws are used.
    *
    * The intended solution is to restrict threadgroups to
    * a single instance by setting SWITCH_ON_EOI, which
    * should cause IA to split instances up. However, this
    * doesn't work correctly on GFX6 when there is no other
    * SE to switch to.
    */
   if (has_primid_instancing_bug && tess_uses_primid)
      num_patches = 1;

   sctx->num_patches_per_workgroup = num_patches;

   unsigned output_patch0_offset = input_patch_size * num_patches;
   unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

   /* Compute userdata SGPRs. */
   assert(((input_vertex_size / 4) & ~0xff) == 0);
   assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
   assert(num_tcs_input_cp <= 32);
   assert(num_tcs_output_cp <= 32);
   assert(num_patches <= 64);
   assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);

   uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?
      si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
   assert((ring_va & u_bit_consecutive(0, 19)) == 0);

   sctx->tes_offchip_ring_va_sgpr = ring_va;
   sctx->tcs_offchip_layout =
      (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
      ((pervertex_output_patch_size * num_patches) << 16);

   /* Compute the LDS size. */
   unsigned lds_size = lds_per_patch * num_patches;

   if (sctx->gfx_level >= GFX7) {
      assert(lds_size <= 65536);
      lds_size = align(lds_size, 512) / 512;
   } else {
      assert(lds_size <= 32768);
      lds_size = align(lds_size, 256) / 256;
   }

   /* Set SI_SGPR_VS_STATE_BITS. */
   SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4);
   SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4);

   /* We should be able to support in-shader LDS use with LLVM >= 9
    * by just adding the lds_sizes together, but it has never
    * been tested. */
   assert(ls_current->config.lds_size == 0);

   unsigned ls_hs_rsrc2;

   if (sctx->gfx_level >= GFX9) {
      ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2;

      if (sctx->gfx_level >= GFX10)
         ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
      else
         ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
   } else {
      ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2;

      si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
      ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
   }

   sctx->ls_hs_rsrc2 = ls_hs_rsrc2;
   sctx->ls_hs_config =
      S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) |
      S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
      S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout);
}
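As a sanity check on the sizing chain above, here is one worked configuration written as a comment. Every input value is assumed for illustration only, and the offchip-buffer clamp is skipped because it depends on tess_offchip_block_dw_size:

/* Worked example (all values assumed): num_tcs_input_cp = 3,
 * num_tcs_output_cp = 3, input_vertex_size = 48, num_tcs_outputs = 2,
 * num_tcs_patch_outputs = 1, TCS outputs are read back.
 *
 *    output_vertex_size          = 2 * 16   = 32
 *    input_patch_size            = 3 * 48   = 144
 *    pervertex_output_patch_size = 3 * 32   = 96
 *    output_patch_size           = 96 + 16  = 112
 *    lds_per_patch               = 144 + 112 = 256
 *
 * max_verts_per_patch = 3, so num_patches starts at 256 / 3 = 85 and is
 * clamped to 64 by the 6-bit shader constant. The LDS clamp
 * 16384 / 256 = 64 leaves it unchanged, and with wave_size = 64 the
 * threadgroup is 64 * 3 = 192 vertices, exactly 3 full waves, so the
 * last-wave trim changes nothing. Total LDS use is 64 * 256 = 16384
 * bytes, so two such workgroups fit under the 32K hw limit.
 */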

static void si_emit_tess_io_layout_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   radeon_begin(cs);

   if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
      return;

   if (sctx->screen->info.has_set_pairs_packets) {
      radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                                 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for merged LS-HS. */
      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                                 GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                                 sctx->tcs_offchip_layout);
      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                                 GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
                                 sctx->tes_offchip_ring_va_sgpr);
   } else if (sctx->gfx_level >= GFX9) {
      radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                            SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for merged LS-HS. */
      radeon_opt_set_sh_reg2(sctx,
                             R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                             GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
   } else {
      /* Due to a hw bug, RSRC2_LS must be written twice with another
       * LS register written in between. */
      if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII)
         radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, sctx->ls_hs_rsrc2);
      radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
      radeon_emit(sctx->shader.vs.current->config.rsrc1);
      radeon_emit(sctx->ls_hs_rsrc2);

      /* Set userdata SGPRs for TCS. */
      radeon_opt_set_sh_reg3(sctx,
                             R_00B430_SPI_SHADER_USER_DATA_HS_0 +
                             GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4,
                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr,
                             sctx->current_vs_state);
   }

   /* Set userdata SGPRs for TES. */
   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
   assert(tes_sh_base);

   /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when
    * tessellation is disabled. That's because those user SGPRs are only set in LS
    * for tessellation.
    */
   if (sctx->screen->info.has_set_pairs_packets) {
      radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX,
                                 sctx->tcs_offchip_layout);
      radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
                                 SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID,
                                 sctx->tes_offchip_ring_va_sgpr);
   } else {
      bool has_gs = sctx->ngg || sctx->shader.gs.cso;

      radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
                             has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX
                                    : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX,
                             sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
   }
   radeon_end();

   radeon_begin_again(cs);
   if (sctx->gfx_level >= GFX7) {
      radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG,
                                     SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config);
   } else {
      radeon_opt_set_context_reg(sctx, R_028B58_VGT_LS_HS_CONFIG,
                                 SI_TRACKED_VGT_LS_HS_CONFIG, sctx->ls_hs_config);
   }
   radeon_end_update_context_roll(sctx);
}
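The LDS_SIZE field programmed into RSRC2 above is in hardware granules, not bytes. A minimal standalone sketch of that conversion, with granule sizes taken from the align() calls in si_update_tess_io_layout_state (the helper itself is hypothetical):

#include <stdbool.h>

/* Hypothetical standalone version of the granule conversion above:
 * GFX7+ encodes LDS_SIZE in 512-byte granules, GFX6 in 256-byte granules. */
static unsigned lds_bytes_to_granules(unsigned lds_bytes, bool gfx7_plus)
{
   unsigned granule = gfx7_plus ? 512 : 256;
   /* Equivalent to align(lds_bytes, granule) / granule. */
   return (lds_bytes + granule - 1) / granule;
}

For the 16384-byte worked example earlier, this would yield 32 granules on GFX7+.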

void si_init_screen_live_shader_cache(struct si_screen *sscreen)
{
   util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector,
@@ -4308,6 +4608,7 @@ void si_init_shader_functions(struct si_context *sctx)
{
   sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state;
   sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
   sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state;

   sctx->b.create_vs_state = si_create_shader;
   sctx->b.create_tcs_state = si_create_shader;