aco/optimizer: Optimize p_extract + v_mul_u32_u24 to v_mad_u32_u16.

This should perform the same, but it removes SDWA from, for example,
the address calculations in NGG culling shaders.

This is done because SDWA is no longer available on GFX11.

Fossil DB stats on GFX1100:
Totals from 36 (0.03% of 134913) affected shaders:
CodeSize: 300968 -> 300884 (-0.03%); split: -0.04%, +0.01%
Instrs: 60955 -> 60863 (-0.15%); split: -0.15%, +0.00%
Latency: 426809 -> 426819 (+0.00%); split: -0.06%, +0.06%
InvThroughput: 39076 -> 39025 (-0.13%); split: -0.14%, +0.01%
VClause: 1440 -> 1443 (+0.21%)
Copies: 5714 -> 5725 (+0.19%)

Fossil DB stats on GFX1100 with NGG culling enabled:
Totals from 60953 (45.18% of 134913) affected shaders:
VGPRs: 2273172 -> 2273160 (-0.00%)
CodeSize: 186401864 -> 186403036 (+0.00%); split: -0.00%, +0.00%
Instrs: 37038048 -> 36977353 (-0.16%); split: -0.16%, +0.00%
Latency: 146466770 -> 146350172 (-0.08%); split: -0.08%, +0.00%
InvThroughput: 15342790 -> 15228585 (-0.74%); split: -0.74%, +0.00%
VClause: 669662 -> 669665 (+0.00%)
Copies: 2972380 -> 2972482 (+0.00%); split: -0.01%, +0.01%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17924>
This commit is contained in:
Timur Kristóf 2022-08-06 09:40:44 +02:00 committed by Marge Bot
parent 171d76ded1
commit faba30a8f3

View file

@ -1106,6 +1106,11 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
(sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
return true;
} else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
!instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
(instr->operands[!idx].is16bit() ||
instr->operands[!idx].constantValue() <= UINT16_MAX)) {
return true;
} else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
(tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
@ -1162,6 +1167,18 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
(sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
/* The undesirable upper bits are already shifted out. */
return;
} else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
!instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
(instr->operands[!idx].is16bit() ||
instr->operands[!idx].constantValue() <= UINT16_MAX)) {
Instruction* mad =
create_instruction<VOP3_instruction>(aco_opcode::v_mad_u32_u16, Format::VOP3, 3, 1);
mad->definitions[0] = instr->definitions[0];
mad->operands[0] = instr->operands[0];
mad->operands[1] = instr->operands[1];
mad->operands[2] = Operand::zero();
mad->vop3().opsel = (sel.offset() / 2) << idx;
instr.reset(mad);
} else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
(tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
to_SDWA(ctx, instr);