ac/nir: reserve the first LDS vec4 for the HS tf0/1 group vote in TCS

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
Marek Olšák 2024-09-29 17:30:49 -04:00 committed by Marge Bot
parent fd5779c198
commit f4eebb373c
5 changed files with 38 additions and 10 deletions

View file

@ -17,6 +17,9 @@
extern "C" {
#endif
/* Number of bytes reserved at the very beginning of LDS for the tf0/1 shader
 * message group vote, i.e. one vec4 (16 bytes).
 *
 * The reservation only applies to GFX11 and newer: on those chips the LS/HS
 * LDS offsets are shifted by this amount and the total tessellation LDS size
 * is increased by it.
 */
#define AC_HS_MSG_VOTE_LDS_BYTES 16
enum
{
/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
@ -76,6 +79,7 @@ bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
void
ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_inputs_read,
uint64_t tcs_temp_only_inputs);
@ -83,6 +87,7 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
void
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_temp_only_inputs);

View file

@ -61,8 +61,8 @@
* TCS per-vertex inputs for patch 1
* TCS per-vertex inputs for patch 2 < hs_per_vertex_input_lds_offset (rel_patch_id = 2)
* ...
* TCS per-vertex outputs for patch 0 < output_patch0_offset
* TCS per-patch outputs for patch 0 < output_patch0_patch_data_offset
* TCS per-vertex outputs for patch 0 < hs_output_lds_offset (rel_patch_id = 0, per-vertex)
* TCS per-patch outputs for patch 0 < hs_output_lds_offset (rel_patch_id = 0, per-patch)
* TCS per-vertex outputs for patch 1
* TCS per-patch outputs for patch 1
* TCS per-vertex outputs for patch 2 < hs_output_lds_offset (rel_patch_id = 2, per-vertex)
@ -284,6 +284,11 @@ lower_ls_output_store(nir_builder *b,
unsigned write_mask = nir_intrinsic_write_mask(intrin);
nir_def *off = nir_iadd_nuw(b, base_off_var, io_off);
/* The first vec4 is reserved for the tf0/1 shader message group vote. */
if (st->gfx_level >= GFX11)
off = nir_iadd_imm_nuw(b, off, AC_HS_MSG_VOTE_LDS_BYTES);
AC_NIR_STORE_IO(b, intrin->src[0].ssa, 0, write_mask, io_sem.high_16bits,
nir_store_shared, off, .write_mask = store_write_mask, .base = store_const_offset);
@ -354,8 +359,10 @@ hs_per_vertex_input_lds_offset(nir_builder *b,
const unsigned mapped = ac_nir_map_io_location(io_sem.location, st->tcs_inputs_read & ~st->tcs_temp_only_inputs,
st->map_io);
nir_def *io_offset = ac_nir_calc_io_off(b, instr, nir_imm_int(b, 16u), 4u, mapped);
nir_def *lds_offset = nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset);
return nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset);
/* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */
return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset;
}
static unsigned
@ -419,17 +426,21 @@ hs_output_lds_offset(nir_builder *b,
nir_def *input_patch_size = nir_imul(b, tcs_in_vtxcnt, nir_load_lshs_vertex_stride_amd(b));
nir_def *output_patch0_offset = nir_imul(b, input_patch_size, tcs_num_patches);
nir_def *output_patch_offset = nir_iadd_nuw(b, patch_offset, output_patch0_offset);
nir_def *lds_offset;
if (per_vertex) {
nir_def *vertex_index = nir_get_io_arrayed_index_src(intrin)->ssa;
nir_def *vertex_index_off = nir_imul_imm(b, vertex_index, output_vertex_size);
off = nir_iadd_nuw(b, off, vertex_index_off);
return nir_iadd_nuw(b, off, output_patch_offset);
lds_offset = nir_iadd_nuw(b, off, output_patch_offset);
} else {
off = nir_iadd_imm_nuw(b, off, pervertex_output_patch_size);
return nir_iadd_nuw(b, off, output_patch_offset);
lds_offset = nir_iadd_nuw(b, off, output_patch_offset);
}
/* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */
return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset;
}
static unsigned
@ -963,6 +974,7 @@ filter_any_input_access(const nir_instr *instr,
void
ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_inputs_read,
uint64_t tcs_temp_only_inputs)
@ -970,6 +982,7 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
assert(shader->info.stage == MESA_SHADER_VERTEX);
lower_tess_io_state state = {
.gfx_level = gfx_level,
.tcs_in_out_eq = tcs_in_out_eq,
.tcs_inputs_read = tcs_inputs_read,
.tcs_temp_only_inputs = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
@ -984,12 +997,14 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
void
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_temp_only_inputs)
{
assert(shader->info.stage == MESA_SHADER_TESS_CTRL);
lower_tess_io_state state = {
.gfx_level = gfx_level,
.tcs_inputs_read = shader->info.inputs_read,
.tcs_in_out_eq = tcs_in_out_eq,
.tcs_temp_only_inputs = tcs_in_out_eq ? tcs_temp_only_inputs : 0,

View file

@ -6,6 +6,7 @@
#include "ac_shader_util.h"
#include "ac_gpu_info.h"
#include "ac_nir.h"
#include "sid.h"
#include "util/u_math.h"
@ -1209,7 +1210,11 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
uint32_t
ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches)
{
const unsigned lds_size = lds_per_patch * num_patches;
unsigned lds_size = lds_per_patch * num_patches;
/* The first vec4 is reserved for the tf0/1 shader message group vote. */
if (info->gfx_level >= GFX11)
lds_size += AC_HS_MSG_VOTE_LDS_BYTES;
assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));

View file

@ -219,7 +219,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
if (nir->info.stage == MESA_SHADER_VERTEX) {
if (info->vs.as_ls) {
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, info->vs.tcs_in_out_eq,
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask);
return true;
} else if (info->vs.as_es) {
@ -227,9 +227,10 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
return true;
}
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, info->vs.tcs_in_out_eq, info->vs.tcs_temp_only_input_mask);
NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level,
info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->wave_size);
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
info->vs.tcs_temp_only_input_mask);
NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read,
info->tcs.tes_patch_inputs_read, info->wave_size);
return true;
} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {

View file

@ -1855,6 +1855,7 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
if (key->ge.as_ls) {
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level,
key->ge.opt.same_patch_vertices,
is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
tcs_vgpr_only_inputs);
@ -1867,6 +1868,7 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level,
key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);
/* Used by hs_emit_write_tess_factors() when monolithic shader. */