mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 15:20:10 +01:00
ac,radv,radeonsi: enable TCS input reads from VGPRs for all compatible loads
Cross-invocation TCS input access doesn't prevent same-invocation access. This improves shaders that use both for the same inputs. Also, if some components of a vec4 slot only use same-invocation access and other components only use cross-invocation access (it's possible after compaction), this takes the VGPR path for the components with same-invocation access, which didn't happen previously because all masks only describe whole vec4s. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
parent
99a03dc9d5
commit
85c20def94
16 changed files with 62 additions and 43 deletions
|
|
@ -81,15 +81,16 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
|
|||
ac_nir_map_io_driver_location map,
|
||||
enum amd_gfx_level gfx_level,
|
||||
bool tcs_in_out_eq,
|
||||
uint64_t tcs_inputs_read,
|
||||
uint64_t tcs_temp_only_inputs);
|
||||
uint64_t tcs_inputs_via_temp,
|
||||
uint64_t tcs_inputs_via_lds);
|
||||
|
||||
void
|
||||
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
|
||||
ac_nir_map_io_driver_location map,
|
||||
enum amd_gfx_level gfx_level,
|
||||
bool tcs_in_out_eq,
|
||||
uint64_t tcs_temp_only_inputs);
|
||||
uint64_t tcs_inputs_via_temp,
|
||||
uint64_t tcs_inputs_via_lds);
|
||||
|
||||
void
|
||||
ac_nir_lower_hs_outputs_to_mem(nir_shader *shader,
|
||||
|
|
|
|||
|
|
@ -1168,19 +1168,24 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
|
|||
ac_nir_map_io_driver_location map,
|
||||
enum amd_gfx_level gfx_level,
|
||||
bool tcs_in_out_eq,
|
||||
uint64_t tcs_inputs_read,
|
||||
uint64_t tcs_temp_only_inputs)
|
||||
uint64_t tcs_inputs_via_temp,
|
||||
uint64_t tcs_inputs_via_lds)
|
||||
{
|
||||
assert(shader->info.stage == MESA_SHADER_VERTEX);
|
||||
assert(gfx_level >= GFX9 || !tcs_in_out_eq);
|
||||
|
||||
lower_tess_io_state state = {
|
||||
.gfx_level = gfx_level,
|
||||
.tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
|
||||
.tcs_inputs_via_lds = tcs_inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
|
||||
.map_io = map,
|
||||
};
|
||||
|
||||
if (tcs_in_out_eq) {
|
||||
state.tcs_inputs_via_temp = tcs_inputs_via_temp;
|
||||
state.tcs_inputs_via_lds = tcs_inputs_via_lds;
|
||||
} else {
|
||||
state.tcs_inputs_via_lds = tcs_inputs_via_lds | tcs_inputs_via_temp;
|
||||
}
|
||||
|
||||
nir_shader_intrinsics_pass(shader, lower_ls_output_store,
|
||||
nir_metadata_control_flow,
|
||||
&state);
|
||||
|
|
@ -1191,18 +1196,24 @@ ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
|
|||
ac_nir_map_io_driver_location map,
|
||||
enum amd_gfx_level gfx_level,
|
||||
bool tcs_in_out_eq,
|
||||
uint64_t tcs_temp_only_inputs)
|
||||
uint64_t tcs_inputs_via_temp,
|
||||
uint64_t tcs_inputs_via_lds)
|
||||
{
|
||||
assert(shader->info.stage == MESA_SHADER_TESS_CTRL);
|
||||
assert(gfx_level >= GFX9 || !tcs_in_out_eq);
|
||||
|
||||
lower_tess_io_state state = {
|
||||
.gfx_level = gfx_level,
|
||||
.tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
|
||||
.tcs_inputs_via_lds = shader->info.inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
|
||||
.map_io = map,
|
||||
};
|
||||
|
||||
if (tcs_in_out_eq) {
|
||||
state.tcs_inputs_via_temp = tcs_inputs_via_temp;
|
||||
state.tcs_inputs_via_lds = tcs_inputs_via_lds;
|
||||
} else {
|
||||
state.tcs_inputs_via_lds = shader->info.inputs_read;
|
||||
}
|
||||
|
||||
nir_shader_lower_instructions(shader,
|
||||
filter_load_tcs_per_vertex_input,
|
||||
lower_hs_per_vertex_input_load,
|
||||
|
|
|
|||
|
|
@ -11930,7 +11930,7 @@ select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader
|
|||
|
||||
/* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
|
||||
const bool tcs_skip_barrier =
|
||||
ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
|
||||
ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;
|
||||
|
||||
/* A barrier is usually needed at the beginning of the second shader, with exceptions. */
|
||||
const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ struct isel_context {
|
|||
Temp ttmp8;
|
||||
|
||||
/* tessellation information */
|
||||
uint64_t tcs_temp_only_inputs;
|
||||
bool any_tcs_inputs_via_lds = false;
|
||||
bool tcs_in_out_eq = false;
|
||||
|
||||
/* Fragment color output information */
|
||||
|
|
|
|||
|
|
@ -228,7 +228,7 @@ void
|
|||
setup_tcs_info(isel_context* ctx)
|
||||
{
|
||||
ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq;
|
||||
ctx->tcs_temp_only_inputs = ctx->program->info.vs.tcs_temp_only_input_mask;
|
||||
ctx->any_tcs_inputs_via_lds = ctx->program->info.vs.any_tcs_inputs_via_lds;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ struct aco_shader_info {
|
|||
struct ac_arg epilog_pc; /* Vulkan only */
|
||||
struct {
|
||||
bool tcs_in_out_eq;
|
||||
uint64_t tcs_temp_only_input_mask;
|
||||
bool any_tcs_inputs_via_lds;
|
||||
bool has_prolog;
|
||||
} vs;
|
||||
struct {
|
||||
|
|
|
|||
|
|
@ -220,7 +220,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
|
|||
if (nir->info.stage == MESA_SHADER_VERTEX) {
|
||||
if (info->vs.as_ls) {
|
||||
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
|
||||
info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask);
|
||||
info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
|
||||
return true;
|
||||
} else if (info->vs.as_es) {
|
||||
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, map_output, pdev->info.gfx_level, info->esgs_itemsize, info->gs_inputs_read);
|
||||
|
|
@ -228,7 +228,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
|
|||
}
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
|
||||
info->vs.tcs_temp_only_input_mask);
|
||||
info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
|
||||
NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read,
|
||||
info->tcs.tes_patch_inputs_read, info->wave_size);
|
||||
|
||||
|
|
|
|||
|
|
@ -34,11 +34,11 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
|
|||
ASSIGN_FIELD(ps.has_epilog);
|
||||
ASSIGN_FIELD(merged_shader_compiled_separately);
|
||||
ASSIGN_FIELD(vs.tcs_in_out_eq);
|
||||
ASSIGN_FIELD(vs.tcs_temp_only_input_mask);
|
||||
ASSIGN_FIELD(vs.has_prolog);
|
||||
ASSIGN_FIELD(tcs.num_lds_blocks);
|
||||
ASSIGN_FIELD(ps.num_inputs);
|
||||
ASSIGN_FIELD(cs.uses_full_subgroups);
|
||||
aco_info->vs.any_tcs_inputs_via_lds = radv->vs.tcs_inputs_via_lds != 0;
|
||||
aco_info->ps.spi_ps_input_ena = radv->ps.spi_ps_input_ena;
|
||||
aco_info->ps.spi_ps_input_addr = radv->ps.spi_ps_input_addr;
|
||||
aco_info->ps.has_prolog = false;
|
||||
|
|
|
|||
|
|
@ -588,7 +588,7 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
|
|||
}
|
||||
|
||||
info->gs_inputs_read = ~0ULL;
|
||||
info->vs.hs_inputs_read = ~0ULL;
|
||||
info->vs.tcs_inputs_via_lds = ~0ULL;
|
||||
|
||||
/* Use per-attribute vertex descriptors to prevent faults and for correct bounds checking. */
|
||||
info->vs.use_per_attribute_vb_descs = radv_use_per_attribute_vb_descs(nir, gfx_state, stage_key);
|
||||
|
|
@ -1782,7 +1782,7 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
|
|||
struct radv_shader_stage *vs_stage = producer;
|
||||
struct radv_shader_stage *tcs_stage = consumer;
|
||||
|
||||
vs_stage->info.vs.hs_inputs_read = tcs_stage->nir->info.inputs_read;
|
||||
vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.inputs_read;
|
||||
|
||||
if (gfx_state->ts.patch_control_points) {
|
||||
vs_stage->info.workgroup_size =
|
||||
|
|
@ -1804,11 +1804,16 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
|
|||
gfx_state->ts.patch_control_points == tcs_stage->info.tcs.tcs_vertices_out &&
|
||||
vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode;
|
||||
|
||||
if (vs_stage->info.vs.tcs_in_out_eq)
|
||||
vs_stage->info.vs.tcs_temp_only_input_mask =
|
||||
tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written &
|
||||
~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read &
|
||||
~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly;
|
||||
if (vs_stage->info.vs.tcs_in_out_eq) {
|
||||
vs_stage->info.vs.tcs_inputs_via_temp = vs_stage->nir->info.outputs_written &
|
||||
~vs_stage->nir->info.outputs_accessed_indirectly &
|
||||
tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read;
|
||||
vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read |
|
||||
(tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
|
||||
tcs_stage->nir->info.inputs_read_indirectly) |
|
||||
(tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
|
||||
vs_stage->nir->info.outputs_accessed_indirectly);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -118,7 +118,8 @@ struct radv_shader_info {
|
|||
bool as_es;
|
||||
bool as_ls;
|
||||
bool tcs_in_out_eq;
|
||||
uint64_t tcs_temp_only_input_mask;
|
||||
uint64_t tcs_inputs_via_temp;
|
||||
uint64_t tcs_inputs_via_lds;
|
||||
uint8_t num_linked_outputs;
|
||||
bool needs_base_instance;
|
||||
bool use_per_attribute_vb_descs;
|
||||
|
|
@ -128,7 +129,6 @@ struct radv_shader_info {
|
|||
bool dynamic_inputs;
|
||||
bool dynamic_num_verts_per_prim;
|
||||
uint32_t num_outputs; /* For NGG streamout only */
|
||||
uint64_t hs_inputs_read; /* Mask of HS inputs read (only used by linked LS) */
|
||||
} vs;
|
||||
struct {
|
||||
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
|
||||
|
|
|
|||
|
|
@ -1857,8 +1857,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
|
|||
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
|
||||
sel->screen->info.gfx_level,
|
||||
key->ge.opt.same_patch_vertices,
|
||||
is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
|
||||
tcs_vgpr_only_inputs);
|
||||
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
|
||||
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
|
||||
return true;
|
||||
} else if (key->ge.as_es) {
|
||||
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
|
||||
|
|
@ -1868,8 +1868,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
|
|||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
|
||||
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
|
||||
sel->screen->info.gfx_level,
|
||||
key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);
|
||||
sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
|
||||
sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);
|
||||
|
||||
/* Used by hs_emit_write_tess_factors() when monolithic shader. */
|
||||
if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
|
||||
|
|
@ -3644,8 +3644,7 @@ nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
|
|||
|
||||
si_init_shader_args(prev_shader, args);
|
||||
|
||||
nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
|
||||
sel->info.tcs_vgpr_only_inputs, NULL);
|
||||
nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, 0, NULL);
|
||||
|
||||
si_update_shader_binary_info(shader, nir);
|
||||
|
||||
|
|
|
|||
|
|
@ -482,7 +482,8 @@ struct si_shader_info {
|
|||
uint16_t enabled_streamout_buffer_mask;
|
||||
|
||||
uint64_t inputs_read; /* "get_unique_index" bits */
|
||||
uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
|
||||
uint64_t tcs_inputs_via_temp;
|
||||
uint64_t tcs_inputs_via_lds;
|
||||
|
||||
/* For VS before {TCS, TES, GS} and TES before GS. */
|
||||
uint64_t ls_es_outputs_written; /* "get_unique_index" bits */
|
||||
|
|
|
|||
|
|
@ -87,7 +87,9 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
|
|||
switch (stage) {
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices;
|
||||
info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs;
|
||||
info->vs.any_tcs_inputs_via_lds = sel->info.tcs_inputs_via_lds ||
|
||||
(!shader->key.ge.opt.same_patch_vertices &&
|
||||
sel->info.tcs_inputs_via_temp);
|
||||
info->tcs.tcs_offchip_layout = args->tcs_offchip_layout;
|
||||
break;
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
|
|
|
|||
|
|
@ -702,9 +702,10 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
|
|||
assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
|
||||
}
|
||||
|
||||
info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read &
|
||||
~info->base.inputs_read_indirectly &
|
||||
info->base.inputs_read;
|
||||
info->tcs_inputs_via_temp = info->base.tess.tcs_same_invocation_inputs_read;
|
||||
info->tcs_inputs_via_lds = info->base.tess.tcs_cross_invocation_inputs_read |
|
||||
(info->base.tess.tcs_same_invocation_inputs_read &
|
||||
info->base.inputs_read_indirectly);
|
||||
}
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
|
|
|
|||
|
|
@ -702,8 +702,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
|
|||
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* We need the barrier only if TCS inputs are read from LDS. */
|
||||
if (!shader->key.ge.opt.same_patch_vertices ||
|
||||
shader->selector->info.base.inputs_read &
|
||||
~shader->selector->info.tcs_vgpr_only_inputs) {
|
||||
shader->selector->info.tcs_inputs_via_lds) {
|
||||
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
|
||||
|
||||
/* If both input and output patches are wholly in one wave, we don't need a barrier.
|
||||
|
|
|
|||
|
|
@ -4661,11 +4661,11 @@ unsigned si_shader_lshs_vertex_stride(struct si_shader *ls)
|
|||
assert(tcs->selector->screen->info.gfx_level >= GFX9);
|
||||
|
||||
if (tcs->is_monolithic) {
|
||||
uint64_t lds_inputs_read = tcs->selector->info.base.inputs_read;
|
||||
uint64_t lds_inputs_read = tcs->selector->info.tcs_inputs_via_lds;
|
||||
|
||||
/* Don't allocate LDS for inputs passed via VGPRs. */
|
||||
if (tcs->key.ge.opt.same_patch_vertices)
|
||||
lds_inputs_read &= ~tcs->selector->info.tcs_vgpr_only_inputs;
|
||||
/* If the TCS in/out number of vertices is different, all inputs are passed via LDS. */
|
||||
if (!tcs->key.ge.opt.same_patch_vertices)
|
||||
lds_inputs_read |= tcs->selector->info.tcs_inputs_via_temp;
|
||||
|
||||
/* NIR lowering passes pack LS outputs/HS inputs if the usage masks of both are known. */
|
||||
num_slots = util_bitcount64(lds_inputs_read);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue