aco: Use s_cbranch_vccz/nz in post-RA optimization.

Add a simple post-RA optimization that takes advantage of the
s_cbranch_vccz and s_cbranch_vccnz instructions.

It works on the following pattern:

vcc = v_cmp ...
scc = s_and vcc, exec
p_cbranch scc

The result looks like this:

vcc = v_cmp ...
p_cbranch vcc

Fossil DB results on Sienna Cichlid:

Totals from 4814 (3.21% of 149839) affected shaders:
CodeSize: 15371176 -> 15345964 (-0.16%)
Instrs: 3028557 -> 3022254 (-0.21%)
Latency: 21872753 -> 21823476 (-0.23%); split: -0.23%, +0.00%
InvThroughput: 4470282 -> 4468691 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7779>
This commit is contained in:
Timur Kristóf 2021-03-20 17:47:05 +01:00 committed by Marge Bot
parent 0e4747d3fb
commit a93092d0ed
3 changed files with 154 additions and 2 deletions

View file

@ -2910,7 +2910,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
if (tmp.regClass() == s1) {
// TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
bool_to_scalar_condition(ctx, src, tmp);
} else if (tmp.type() == RegType::vgpr) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
@ -10168,7 +10167,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
* merge block.
**/
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
assert(cond.regClass() == ctx->program->lane_mask);
cond = bool_to_scalar_condition(ctx, cond);

View file

@ -107,10 +107,67 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
return instr_idx;
}
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   /* Looks for this pattern:
    *
    * vcc = ...                      ; last_vcc_wr
    * sX, scc = s_and_bXX vcc, exec  ; op0_instr
    * (...vcc and exec must not be clobbered in between...)
    * s_cbranch_XX scc               ; instr
    *
    * and, when legal, rewrites it into:
    *
    * vcc = ...                      ; last_vcc_wr
    * s_cbranch_XX vcc               ; instr modified to use vcc
    */

   /* SMEM may corrupt the vccz bit on GFX6-7, so don't attempt this there. */
   if (ctx.program->chip_class < GFX8)
      return;

   /* Only interested in pseudo branches whose first operand is SCC. */
   const bool branches_on_scc = instr->format == Format::PSEUDO_BRANCH &&
                                instr->operands.size() != 0 &&
                                instr->operands[0].physReg() == scc;
   if (!branches_on_scc)
      return;

   const int scc_wr_idx = last_writer_idx(ctx, instr->operands[0]);
   const int vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
   const int exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);

   /* Legality conditions:
    * - the branch operand and VCC were both written in the current block
    * - VCC was NOT written after the branch operand
    * - EXEC is sane (trackable) and was NOT written after the branch operand
    */
   if (scc_wr_idx < 0 || vcc_wr_idx < 0 || vcc_wr_idx > scc_wr_idx ||
       exec_wr_idx > vcc_wr_idx || exec_wr_idx < not_written_in_block)
      return;

   aco_ptr<Instruction> &scc_wr = ctx.current_block->instructions[scc_wr_idx];
   aco_ptr<Instruction> &vcc_wr = ctx.current_block->instructions[vcc_wr_idx];

   /* The SCC writer must be exactly "s_and_bXX vcc, exec",
    * and VCC must come straight from a VOPC instruction. */
   const bool is_and_vcc_exec =
      (scc_wr->opcode == aco_opcode::s_and_b64 /* wave64 */ ||
       scc_wr->opcode == aco_opcode::s_and_b32 /* wave32 */) &&
      scc_wr->operands[0].physReg() == vcc &&
      scc_wr->operands[1].physReg() == exec;
   if (!is_and_vcc_exec || !vcc_wr->isVOPC())
      return;

   assert(vcc_wr->definitions[0].tempId() == scc_wr->operands[0].tempId());

   /* The branch no longer reads the s_and's SCC def. */
   ctx.uses[instr->operands[0].tempId()]--;

   /* Branch directly on VCC instead of SCC. */
   instr->operands[0] = scc_wr->operands[0];
}
/* Run all post-RA peephole optimizations on a single instruction and then
 * record its register writes for later pattern matching. */
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   /* Advance the running index of the instruction within the current block. */
   ctx.current_instr_idx++;
   try_apply_branch_vcc(ctx, instr);
   /* NOTE(review): guarding on instr suggests an optimization may presumably
    * reset the pointer (delete the instruction) — confirm; only track register
    * writes when the instruction still exists. */
   if (instr)
      save_reg_writes(ctx, instr);
}

View file

@ -25,3 +25,100 @@
#include "helpers.h"
using namespace aco;
BEGIN_TEST(optimizer_postRA.vcmp)
   /* Unit tests for the post-RA s_cbranch_vccz/vccnz optimization.
    * The "//!" / "//>>" / "//;" comments are directives consumed by the
    * test checker and must match the optimized output exactly.
    * Fix: the fourth case previously reused unit-test index 2; renumber it
    * to 3 (consistently in both the pattern and the writeout) so that every
    * case has a unique, identifiable index. */
   PhysReg reg_v0(256);
   PhysReg reg_s0(0);
   PhysReg reg_s2(2);
   PhysReg reg_s4(4);

   //>> v1: %a:v[0] = p_startpgm
   ASSERTED bool setup_ok = setup_cs("v1", GFX8);
   assert(setup_ok);

   auto &startpgm = bld.instructions->at(0);
   assert(startpgm->opcode == aco_opcode::p_startpgm);
   startpgm->definitions[0].setFixed(reg_v0);

   Temp v_in = inputs[0];

   {
      /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %e:s[2-3] = p_cbranch_z %b:vcc
      //! p_unit_test 0, %e:s[2-3]
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(0, Operand(br, reg_s2));
   }
   //; del b, e

   {
      /* When VCC is overwritten inbetween, don't optimize. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %f:vcc = s_mov_b64 0
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 1, %e:s[2-3], %f:vcc
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand(0u));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
   }
   //; del b, c, d, e, f

   {
      /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
      //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 2, %e:s[2-3]
      auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(2, Operand(br, reg_s2));
   }
   //; del b, c, d, e

   {
      /* When the VCC isn't written by VOPC, don't optimize */
      //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 3, %e:s[2-3]
      auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand(1u), Operand(reg_s4, bld.lm));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(3, Operand(br, reg_s2));
   }
   //; del b, c, d, e, f, x

   {
      /* When EXEC is overwritten inbetween, don't optimize. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %f:exec = s_mov_b64 42
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 4, %e:s[2-3], %f:exec
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(42u));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
   }
   //; del b, c, d, e, f, x

   finish_optimizer_postRA_test();
END_TEST