aco/insert_delay_alu: do not delay lane mask fast forwarding

The delay actually hurts performance when a VALU-written lane mask is fast-forwarded to the instruction that consumes it.

Foz-DB Navi31:
Totals from 30340 (38.21% of 79395) affected shaders:
Instrs: 30778999 -> 30726605 (-0.17%); split: -0.17%, +0.00%
CodeSize: 162380180 -> 162170808 (-0.13%); split: -0.13%, +0.00%
Latency: 228185562 -> 228186976 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 39001151 -> 39000897 (-0.00%); split: -0.00%, +0.00%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31132>
This commit is contained in:
Georg Lehmann 2024-09-10 23:40:43 +02:00 committed by Marge Bot
parent e4889fd4b5
commit ec11cfc69d

View file

@ -49,6 +49,9 @@ struct alu_delay_info {
/* Cycles until the writing SALU instruction is finished*/ /* Cycles until the writing SALU instruction is finished*/
int8_t salu_cycles = 0; int8_t salu_cycles = 0;
/* VALU wrote this as lane mask. */
bool lane_mask_forwarding = true;
bool combine(const alu_delay_info& other) bool combine(const alu_delay_info& other)
{ {
bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs || bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
@ -59,6 +62,7 @@ struct alu_delay_info {
salu_cycles = std::max(salu_cycles, other.salu_cycles); salu_cycles = std::max(salu_cycles, other.salu_cycles);
valu_cycles = std::max(valu_cycles, other.valu_cycles); valu_cycles = std::max(valu_cycles, other.valu_cycles);
trans_cycles = std::max(trans_cycles, other.trans_cycles); trans_cycles = std::max(trans_cycles, other.trans_cycles);
lane_mask_forwarding &= other.lane_mask_forwarding;
return changed; return changed;
} }
@ -122,7 +126,9 @@ struct delay_ctx {
void void
check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr) check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{ {
for (const Operand op : instr->operands) { for (unsigned i = 0; i < instr->operands.size(); i++) {
const Operand op = instr->operands[i];
alu_delay_info op_delay;
if (op.isConstant() || op.isUndefined()) if (op.isConstant() || op.isUndefined())
continue; continue;
@ -131,8 +137,17 @@ check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
std::map<PhysReg, alu_delay_info>::iterator it = std::map<PhysReg, alu_delay_info>::iterator it =
ctx.gpr_map.find(PhysReg{op.physReg() + j}); ctx.gpr_map.find(PhysReg{op.physReg() + j});
if (it != ctx.gpr_map.end()) if (it != ctx.gpr_map.end())
delay.combine(it->second); op_delay.combine(it->second);
} }
bool fast_forward = (instr->opcode == aco_opcode::v_cndmask_b32 ||
instr->opcode == aco_opcode::v_cndmask_b16 ||
instr->opcode == aco_opcode::v_dual_cndmask_b32) &&
i == 2;
fast_forward |= instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
i + 1 == instr->operands.size();
if (!op_delay.lane_mask_forwarding || !fast_forward)
delay.combine(op_delay);
} }
} }
@ -216,6 +231,7 @@ gen_alu(Instruction* instr, delay_ctx& ctx)
if (is_trans || is_valu || instr->isSALU()) { if (is_trans || is_valu || instr->isSALU()) {
alu_delay_info delay; alu_delay_info delay;
delay.lane_mask_forwarding = false;
if (is_trans) { if (is_trans) {
delay.trans_instrs = 0; delay.trans_instrs = 0;
delay.trans_cycles = cycle_info.latency; delay.trans_cycles = cycle_info.latency;
@ -226,9 +242,14 @@ gen_alu(Instruction* instr, delay_ctx& ctx)
delay.salu_cycles = cycle_info.latency; delay.salu_cycles = cycle_info.latency;
} }
for (const Definition& def : instr->definitions) { for (Definition& def : instr->definitions) {
for (unsigned i = 0; i < def.size(); i++) { if (is_valu && def.regClass() == ctx.program->lane_mask) {
auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + i}, delay); delay.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
instr->opcode != aco_opcode::v_readfirstlane_b32;
}
for (unsigned j = 0; j < def.size(); j++) {
auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + j}, delay);
if (!it.second) if (!it.second)
it.first->second.combine(delay); it.first->second.combine(delay);
} }