aco: avoid full 32bit imul for uniform reduce/scan

Foz-DB Navi31:
Totals from 24 (0.03% of 79395) affected shaders:
Instrs: 1172275 -> 1172078 (-0.02%)
CodeSize: 5974424 -> 5973860 (-0.01%)
Latency: 5896216 -> 5895923 (-0.00%)
InvThroughput: 1167928 -> 1167869 (-0.01%)
VALU: 625756 -> 625636 (-0.02%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28223>
This commit is contained in:
Georg Lehmann 2024-03-16 11:58:01 +01:00 committed by Marge Bot
parent 96ff511b75
commit 67997fd735

View file

@@ -7763,14 +7763,20 @@ emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_s
assert(dst.getTemp().type() == count.type());
if (nir_src_is_const(src)) {
if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
uint32_t imm = nir_src_as_uint(src);
if (imm == 1 && dst.bytes() <= 2)
bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
else if (nir_src_as_uint(src) == 1)
else if (imm == 1)
bld.copy(dst, count);
else if (nir_src_as_uint(src) == 0)
else if (imm == 0)
bld.copy(dst, Operand::zero(dst.bytes()));
else if (count.type() == RegType::vgpr)
bld.v_mul_imm(dst, count, nir_src_as_uint(src));
bld.v_mul_imm(dst, count, imm, true, true);
else if (imm == 0xffffffff)
bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
else if (util_is_power_of_two_or_zero(imm))
bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
Operand::c32(ffs(imm) - 1u));
else
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
} else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {