ac,radv,radeonsi: enable TCS input reads from VGPRs for all compatible loads

Cross-invocation TCS input access doesn't prevent same-invocation access.
This improves shaders that use both for the same inputs.

Also, if some components of a vec4 slot only use same-invocation access and
other components only use cross-invocation access (it's possible after
compaction), this takes the VGPR path for the components with
same-invocation access, which didn't happen previously because all masks
only describe whole vec4s.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
Marek Olšák 2024-10-02 16:48:39 -04:00 committed by Marge Bot
parent 99a03dc9d5
commit 85c20def94
16 changed files with 62 additions and 43 deletions

View file

@ -81,15 +81,16 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_inputs_read,
uint64_t tcs_temp_only_inputs);
uint64_t tcs_inputs_via_temp,
uint64_t tcs_inputs_via_lds);
void
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_temp_only_inputs);
uint64_t tcs_inputs_via_temp,
uint64_t tcs_inputs_via_lds);
void
ac_nir_lower_hs_outputs_to_mem(nir_shader *shader,

View file

@ -1168,19 +1168,24 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_inputs_read,
uint64_t tcs_temp_only_inputs)
uint64_t tcs_inputs_via_temp,
uint64_t tcs_inputs_via_lds)
{
assert(shader->info.stage == MESA_SHADER_VERTEX);
assert(gfx_level >= GFX9 || !tcs_in_out_eq);
lower_tess_io_state state = {
.gfx_level = gfx_level,
.tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
.tcs_inputs_via_lds = tcs_inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
.map_io = map,
};
if (tcs_in_out_eq) {
state.tcs_inputs_via_temp = tcs_inputs_via_temp;
state.tcs_inputs_via_lds = tcs_inputs_via_lds;
} else {
state.tcs_inputs_via_lds = tcs_inputs_via_lds | tcs_inputs_via_temp;
}
nir_shader_intrinsics_pass(shader, lower_ls_output_store,
nir_metadata_control_flow,
&state);
@ -1191,18 +1196,24 @@ ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
ac_nir_map_io_driver_location map,
enum amd_gfx_level gfx_level,
bool tcs_in_out_eq,
uint64_t tcs_temp_only_inputs)
uint64_t tcs_inputs_via_temp,
uint64_t tcs_inputs_via_lds)
{
assert(shader->info.stage == MESA_SHADER_TESS_CTRL);
assert(gfx_level >= GFX9 || !tcs_in_out_eq);
lower_tess_io_state state = {
.gfx_level = gfx_level,
.tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
.tcs_inputs_via_lds = shader->info.inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
.map_io = map,
};
if (tcs_in_out_eq) {
state.tcs_inputs_via_temp = tcs_inputs_via_temp;
state.tcs_inputs_via_lds = tcs_inputs_via_lds;
} else {
state.tcs_inputs_via_lds = shader->info.inputs_read;
}
nir_shader_lower_instructions(shader,
filter_load_tcs_per_vertex_input,
lower_hs_per_vertex_input_load,

View file

@ -11930,7 +11930,7 @@ select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader
/* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
const bool tcs_skip_barrier =
ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;
/* A barrier is usually needed at the beginning of the second shader, with exceptions. */
const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;

View file

@ -115,7 +115,7 @@ struct isel_context {
Temp ttmp8;
/* tessellation information */
uint64_t tcs_temp_only_inputs;
bool any_tcs_inputs_via_lds = false;
bool tcs_in_out_eq = false;
/* Fragment color output information */

View file

@ -228,7 +228,7 @@ void
setup_tcs_info(isel_context* ctx)
{
ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq;
ctx->tcs_temp_only_inputs = ctx->program->info.vs.tcs_temp_only_input_mask;
ctx->any_tcs_inputs_via_lds = ctx->program->info.vs.any_tcs_inputs_via_lds;
}
void

View file

@ -107,7 +107,7 @@ struct aco_shader_info {
struct ac_arg epilog_pc; /* Vulkan only */
struct {
bool tcs_in_out_eq;
uint64_t tcs_temp_only_input_mask;
bool any_tcs_inputs_via_lds;
bool has_prolog;
} vs;
struct {

View file

@ -220,7 +220,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
if (nir->info.stage == MESA_SHADER_VERTEX) {
if (info->vs.as_ls) {
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask);
info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
return true;
} else if (info->vs.as_es) {
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, map_output, pdev->info.gfx_level, info->esgs_itemsize, info->gs_inputs_read);
@ -228,7 +228,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
}
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
info->vs.tcs_temp_only_input_mask);
info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read,
info->tcs.tes_patch_inputs_read, info->wave_size);

View file

@ -34,11 +34,11 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
ASSIGN_FIELD(ps.has_epilog);
ASSIGN_FIELD(merged_shader_compiled_separately);
ASSIGN_FIELD(vs.tcs_in_out_eq);
ASSIGN_FIELD(vs.tcs_temp_only_input_mask);
ASSIGN_FIELD(vs.has_prolog);
ASSIGN_FIELD(tcs.num_lds_blocks);
ASSIGN_FIELD(ps.num_inputs);
ASSIGN_FIELD(cs.uses_full_subgroups);
aco_info->vs.any_tcs_inputs_via_lds = radv->vs.tcs_inputs_via_lds != 0;
aco_info->ps.spi_ps_input_ena = radv->ps.spi_ps_input_ena;
aco_info->ps.spi_ps_input_addr = radv->ps.spi_ps_input_addr;
aco_info->ps.has_prolog = false;

View file

@ -588,7 +588,7 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
}
info->gs_inputs_read = ~0ULL;
info->vs.hs_inputs_read = ~0ULL;
info->vs.tcs_inputs_via_lds = ~0ULL;
/* Use per-attribute vertex descriptors to prevent faults and for correct bounds checking. */
info->vs.use_per_attribute_vb_descs = radv_use_per_attribute_vb_descs(nir, gfx_state, stage_key);
@ -1782,7 +1782,7 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
struct radv_shader_stage *vs_stage = producer;
struct radv_shader_stage *tcs_stage = consumer;
vs_stage->info.vs.hs_inputs_read = tcs_stage->nir->info.inputs_read;
vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.inputs_read;
if (gfx_state->ts.patch_control_points) {
vs_stage->info.workgroup_size =
@ -1804,11 +1804,16 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
gfx_state->ts.patch_control_points == tcs_stage->info.tcs.tcs_vertices_out &&
vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode;
if (vs_stage->info.vs.tcs_in_out_eq)
vs_stage->info.vs.tcs_temp_only_input_mask =
tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written &
~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read &
~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly;
if (vs_stage->info.vs.tcs_in_out_eq) {
vs_stage->info.vs.tcs_inputs_via_temp = vs_stage->nir->info.outputs_written &
~vs_stage->nir->info.outputs_accessed_indirectly &
tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read;
vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read |
(tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
tcs_stage->nir->info.inputs_read_indirectly) |
(tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
vs_stage->nir->info.outputs_accessed_indirectly);
}
}
}
}

View file

@ -118,7 +118,8 @@ struct radv_shader_info {
bool as_es;
bool as_ls;
bool tcs_in_out_eq;
uint64_t tcs_temp_only_input_mask;
uint64_t tcs_inputs_via_temp;
uint64_t tcs_inputs_via_lds;
uint8_t num_linked_outputs;
bool needs_base_instance;
bool use_per_attribute_vb_descs;
@ -128,7 +129,6 @@ struct radv_shader_info {
bool dynamic_inputs;
bool dynamic_num_verts_per_prim;
uint32_t num_outputs; /* For NGG streamout only */
uint64_t hs_inputs_read; /* Mask of HS inputs read (only used by linked LS) */
} vs;
struct {
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];

View file

@ -1857,8 +1857,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level,
key->ge.opt.same_patch_vertices,
is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
tcs_vgpr_only_inputs);
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
return true;
} else if (key->ge.as_es) {
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
@ -1868,8 +1868,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level,
key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);
sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);
/* Used by hs_emit_write_tess_factors() when monolithic shader. */
if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
@ -3644,8 +3644,7 @@ nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
si_init_shader_args(prev_shader, args);
nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
sel->info.tcs_vgpr_only_inputs, NULL);
nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, 0, NULL);
si_update_shader_binary_info(shader, nir);

View file

@ -482,7 +482,8 @@ struct si_shader_info {
uint16_t enabled_streamout_buffer_mask;
uint64_t inputs_read; /* "get_unique_index" bits */
uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
uint64_t tcs_inputs_via_temp;
uint64_t tcs_inputs_via_lds;
/* For VS before {TCS, TES, GS} and TES before GS. */
uint64_t ls_es_outputs_written; /* "get_unique_index" bits */

View file

@ -87,7 +87,9 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
switch (stage) {
case MESA_SHADER_TESS_CTRL:
info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices;
info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs;
info->vs.any_tcs_inputs_via_lds = sel->info.tcs_inputs_via_lds ||
(!shader->key.ge.opt.same_patch_vertices &&
sel->info.tcs_inputs_via_temp);
info->tcs.tcs_offchip_layout = args->tcs_offchip_layout;
break;
case MESA_SHADER_FRAGMENT:

View file

@ -702,9 +702,10 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
}
info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read &
~info->base.inputs_read_indirectly &
info->base.inputs_read;
info->tcs_inputs_via_temp = info->base.tess.tcs_same_invocation_inputs_read;
info->tcs_inputs_via_lds = info->base.tess.tcs_cross_invocation_inputs_read |
(info->base.tess.tcs_same_invocation_inputs_read &
info->base.inputs_read_indirectly);
}
if (nir->info.stage == MESA_SHADER_GEOMETRY) {

View file

@ -702,8 +702,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
/* We need the barrier only if TCS inputs are read from LDS. */
if (!shader->key.ge.opt.same_patch_vertices ||
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs) {
shader->selector->info.tcs_inputs_via_lds) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
/* If both input and output patches are wholly in one wave, we don't need a barrier.

View file

@ -4661,11 +4661,11 @@ unsigned si_shader_lshs_vertex_stride(struct si_shader *ls)
assert(tcs->selector->screen->info.gfx_level >= GFX9);
if (tcs->is_monolithic) {
uint64_t lds_inputs_read = tcs->selector->info.base.inputs_read;
uint64_t lds_inputs_read = tcs->selector->info.tcs_inputs_via_lds;
/* Don't allocate LDS for inputs passed via VGPRs. */
if (tcs->key.ge.opt.same_patch_vertices)
lds_inputs_read &= ~tcs->selector->info.tcs_vgpr_only_inputs;
/* If the TCS in/out number of vertices is different, all inputs are passed via LDS. */
if (!tcs->key.ge.opt.same_patch_vertices)
lds_inputs_read |= tcs->selector->info.tcs_inputs_via_temp;
/* NIR lowering passes pack LS outputs/HS inputs if the usage masks of both are known. */
num_slots = util_bitcount64(lds_inputs_read);