mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 00:58:05 +02:00
aco: Use s_cbranch_vccz/nz in post-RA optimization.
A simple post-RA optimization which takes advantage of the s_cbranch_vccz and s_cbranch_vccnz instructions.

It works on the following pattern:

    vcc = v_cmp ...
    scc = s_and vcc, exec
    p_cbranch scc

The result looks like this:

    vcc = v_cmp ...
    p_cbranch vcc

Fossil DB results on Sienna Cichlid:

Totals from 4814 (3.21% of 149839) affected shaders:
CodeSize: 15371176 -> 15345964 (-0.16%)
Instrs: 3028557 -> 3022254 (-0.21%)
Latency: 21872753 -> 21823476 (-0.23%); split: -0.23%, +0.00%
InvThroughput: 4470282 -> 4468691 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7779>
This commit is contained in:
parent
0e4747d3fb
commit
a93092d0ed
3 changed files with 154 additions and 2 deletions
|
|
@ -2910,7 +2910,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
|
||||
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
|
||||
if (tmp.regClass() == s1) {
|
||||
// TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
|
||||
bool_to_scalar_condition(ctx, src, tmp);
|
||||
} else if (tmp.type() == RegType::vgpr) {
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
|
||||
|
|
@ -10168,7 +10167,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
|
|||
* merge block.
|
||||
**/
|
||||
|
||||
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
|
||||
assert(cond.regClass() == ctx->program->lane_mask);
|
||||
cond = bool_to_scalar_condition(ctx, cond);
|
||||
|
||||
|
|
|
|||
|
|
@ -107,10 +107,67 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
|
|||
return instr_idx;
|
||||
}
|
||||
|
||||
/* Post-RA peephole: let a conditional pseudo branch read VCC directly instead of SCC.
 *
 * Pattern being matched:
 *
 *    vcc = ...                        ; VCC writer, must be a VOPC instruction
 *    sX, scc = s_and_bXX vcc, exec    ; SCC producer
 *    (...neither vcc nor exec may be overwritten in between...)
 *    s_cbranch_XX scc                 ; instr
 *
 * Result when every check passes:
 *
 *    vcc = ...
 *    s_cbranch_XX vcc                 ; instr now uses vcc as its condition
 *
 * This function does not delete the s_and; it only lowers the use count of the
 * SCC temporary that the branch no longer reads.
 */
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   /* GFX6-7 are excluded because SMEM may corrupt the vccz bit there. */
   if (ctx.program->chip_class < GFX8)
      return;

   /* Only a pseudo branch whose first operand lives in SCC is a candidate. */
   if (instr->format != Format::PSEUDO_BRANCH)
      return;
   if (instr->operands.size() == 0)
      return;
   if (instr->operands[0].physReg() != scc)
      return;

   const int scc_wr_idx = last_writer_idx(ctx, instr->operands[0]);
   const int vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
   const int exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);

   /* Both the branch condition and VCC must have been written in the current block. */
   const bool writers_known = scc_wr_idx >= 0 && vcc_wr_idx >= 0;
   if (!writers_known)
      return;

   /* VCC must not have been written after the branch condition was produced. */
   if (vcc_wr_idx > scc_wr_idx)
      return;

   /* EXEC must not have been written after VCC was, and its tracking state must
    * not be anything below not_written_in_block.
    */
   if (exec_wr_idx > vcc_wr_idx)
      return;
   if (exec_wr_idx < not_written_in_block)
      return;

   aco_ptr<Instruction> &scc_producer = ctx.current_block->instructions[scc_wr_idx];
   aco_ptr<Instruction> &vcc_producer = ctx.current_block->instructions[vcc_wr_idx];

   /* The condition has to come from "s_and vcc, exec" (b64 in wave64, b32 in wave32). */
   const bool is_lane_mask_and = scc_producer->opcode == aco_opcode::s_and_b64 ||
                                 scc_producer->opcode == aco_opcode::s_and_b32;
   if (!is_lane_mask_and)
      return;
   if (scc_producer->operands[0].physReg() != vcc)
      return;
   if (scc_producer->operands[1].physReg() != exec)
      return;

   /* VCC itself has to be the result of a VOPC instruction. */
   if (!vcc_producer->isVOPC())
      return;

   assert(vcc_producer->definitions[0].tempId() == scc_producer->operands[0].tempId());

   /* The branch stops reading the SCC definition, so it has one use less. */
   ctx.uses[instr->operands[0].tempId()]--;

   /* Make the branch read VCC (the first source of the s_and) instead of SCC. */
   instr->operands[0] = scc_producer->operands[0];
}
|
||||
|
||||
/* Handles one instruction of the current block: advances the running
 * instruction index, gives the VCC-branch peephole a chance to rewrite the
 * instruction, and then records the instruction's register writes.
 */
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   ++ctx.current_instr_idx;

   try_apply_branch_vcc(ctx, instr);

   /* Nothing to record when the instruction pointer is empty. */
   if (!instr)
      return;

   save_reg_writes(ctx, instr);
}
|
||||
|
|
|
|||
|
|
@ -25,3 +25,100 @@
|
|||
#include "helpers.h"
|
||||
|
||||
using namespace aco;
|
||||
|
||||
BEGIN_TEST(optimizer_postRA.vcmp)
|
||||
PhysReg reg_v0(256);
|
||||
PhysReg reg_s0(0);
|
||||
PhysReg reg_s2(2);
|
||||
PhysReg reg_s4(4);
|
||||
|
||||
//>> v1: %a:v[0] = p_startpgm
|
||||
ASSERTED bool setup_ok = setup_cs("v1", GFX8);
|
||||
assert(setup_ok);
|
||||
|
||||
auto &startpgm = bld.instructions->at(0);
|
||||
assert(startpgm->opcode == aco_opcode::p_startpgm);
|
||||
startpgm->definitions[0].setFixed(reg_v0);
|
||||
|
||||
Temp v_in = inputs[0];
|
||||
|
||||
{
|
||||
/* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */
|
||||
|
||||
//! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
|
||||
//! s2: %e:s[2-3] = p_cbranch_z %b:vcc
|
||||
//! p_unit_test 0, %e:s[2-3]
|
||||
auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
|
||||
auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
|
||||
auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
|
||||
writeout(0, Operand(br, reg_s2));
|
||||
}
|
||||
|
||||
//; del b, e
|
||||
|
||||
{
|
||||
/* When VCC is overwritten inbetween, don't optimize. */
|
||||
|
||||
//! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
|
||||
//! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
|
||||
//! s2: %f:vcc = s_mov_b64 0
|
||||
//! s2: %e:s[2-3] = p_cbranch_z %d:scc
|
||||
//! p_unit_test 1, %e:s[2-3], %f:vcc
|
||||
auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
|
||||
auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
|
||||
auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand(0u));
|
||||
auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
|
||||
writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
|
||||
}
|
||||
|
||||
//; del b, c, d, e, f
|
||||
|
||||
{
|
||||
/* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
|
||||
|
||||
//! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
|
||||
//! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
|
||||
//! s2: %e:s[2-3] = p_cbranch_z %d:scc
|
||||
//! p_unit_test 2, %e:s[2-3]
|
||||
auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand(0u), Operand(v_in, reg_v0));
|
||||
auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm));
|
||||
auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
|
||||
writeout(2, Operand(br, reg_s2));
|
||||
}
|
||||
|
||||
//; del b, c, d, e
|
||||
|
||||
{
|
||||
/* When the VCC isn't written by VOPC, don't optimize */
|
||||
|
||||
//! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
|
||||
//! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
|
||||
//! s2: %e:s[2-3] = p_cbranch_z %d:scc
|
||||
//! p_unit_test 2, %e:s[2-3]
|
||||
auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand(1u), Operand(reg_s4, bld.lm));
|
||||
auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm));
|
||||
auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
|
||||
writeout(2, Operand(br, reg_s2));
|
||||
}
|
||||
|
||||
//; del b, c, d, e, f, x
|
||||
|
||||
{
|
||||
/* When EXEC is overwritten inbetween, don't optimize. */
|
||||
|
||||
//! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
|
||||
//! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
|
||||
//! s2: %f:exec = s_mov_b64 42
|
||||
//! s2: %e:s[2-3] = p_cbranch_z %d:scc
|
||||
//! p_unit_test 4, %e:s[2-3], %f:exec
|
||||
auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
|
||||
auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
|
||||
auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(42u));
|
||||
auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
|
||||
writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
|
||||
}
|
||||
|
||||
//; del b, c, d, e, f, x
|
||||
|
||||
finish_optimizer_postRA_test();
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue