aco: disable wqm for tex loads when not needed

By only executing VMEM loads for lanes where the result is used, we can save
bandwidth.

The NIR pass only handles tex for now, but those are the most common anyway.
We can extend it to handle image/ssbo/ubo/global loads in the future.

Foz-DB GFX1201:
Totals from 32633 (40.66% of 80251) affected shaders:
Instrs: 22635910 -> 23193509 (+2.46%); split: -0.00%, +2.46%
CodeSize: 122880044 -> 125093428 (+1.80%); split: -0.00%, +1.81%
VGPRs: 1481868 -> 1481712 (-0.01%)
SpillSGPRs: 3877 -> 4301 (+10.94%); split: -0.52%, +11.45%
Latency: 171480552 -> 171685219 (+0.12%); split: -0.18%, +0.30%
InvThroughput: 24364743 -> 24373441 (+0.04%); split: -0.08%, +0.12%
VClause: 388318 -> 388557 (+0.06%); split: -0.06%, +0.13%
SClause: 774781 -> 776492 (+0.22%); split: -0.29%, +0.51%
Copies: 1416586 -> 1541199 (+8.80%); split: -0.16%, +8.96%
Branches: 419591 -> 419673 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 1330303 -> 1416540 (+6.48%)
PreVGPRs: 964864 -> 964863 (-0.00%)
VALU: 12919601 -> 12920254 (+0.01%); split: -0.01%, +0.01%
SALU: 2685402 -> 3224147 (+20.06%); split: -0.00%, +20.07%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35970>
This commit is contained in:
Georg Lehmann 2025-07-06 23:12:58 +02:00 committed by Marge Bot
parent 7159fd21f8
commit 883b1ca364
3 changed files with 13 additions and 9 deletions

View file

@ -367,6 +367,12 @@ init_context(isel_context* ctx, nir_shader* shader)
apply_nuw_to_offsets(ctx, impl);
ac_nir_flag_smem_for_loads(shader, ctx->program->gfx_level, false, true);
if (shader->info.stage == MESA_SHADER_FRAGMENT) {
nir_opt_load_skip_helpers_options skip_helper_options = {};
skip_helper_options.no_add_divergence = true;
nir_opt_load_skip_helpers(shader, &skip_helper_options);
}
/* sanitize control flow */
sanitize_cf_list(impl, &impl->body);
nir_progress(true, impl, nir_metadata_none);
@ -621,11 +627,8 @@ init_context(isel_context* ctx, nir_shader* shader)
}
case nir_instr_type_tex: {
nir_tex_instr* tex = nir_instr_as_tex(instr);
RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr;
if (tex->op == nir_texop_texture_samples) {
assert(!tex->def.divergent);
}
RegType type =
tex->def.divergent || tex->skip_helpers ? RegType::vgpr : RegType::sgpr;
RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size);
regclasses[tex->def.index] = rc;

View file

@ -83,6 +83,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
assert(instr->op != nir_texop_samples_identical);
Builder bld(ctx->program, ctx->block);
bool disable_wqm = instr->skip_helpers;
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
@ -338,7 +339,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
Temp size = bld.tmp(v2);
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, {size}, resource,
Operand(s4), std::vector<Temp>{tg4_lod}, false);
Operand(s4), std::vector<Temp>{tg4_lod}, disable_wqm);
tex->dim = dim;
tex->dmask = 0x3;
tex->da = da;
@ -495,7 +496,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
: aco_opcode::image_load_mip;
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
MIMG_instruction* tex =
emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, false, vdata);
emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, disable_wqm, vdata);
if (instr->op == nir_texop_fragment_mask_fetch_amd)
tex->dim = da ? ac_image_2darray : ac_image_2d;
else
@ -675,7 +676,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
MIMG_instruction* tex =
emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, false, vdata);
emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, disable_wqm, vdata);
tex->dim = dim;
tex->dmask = dmask & 0xf;
tex->da = da;

View file

@ -117,7 +117,7 @@ BEGIN_TEST(d3d11_derivs.discard)
/* The discard gets emitted as demote_if. */
//>> s2: %_:exec, s1: (kill)%_:scc = s_wqm_b64 %_
//! p_exit_early_if_not %_:exec
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (kill)%_, (kill)%_ 2d
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (kill)%_, (kill)%_, %_, (kill)%_ 2d disable_wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
END_TEST