mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 11:48:06 +02:00
aco: optimize for purely linear VGPR copies
fossil-db: Totals from 2 (0.00% of 79242) affected shaders:
Instrs: 1344 -> 1340 (-0.30%)
CodeSize: 6968 -> 6952 (-0.23%)
Latency: 4414 -> 4410 (-0.09%)
InvThroughput: 1018 -> 1020 (+0.20%)
Copies: 60 -> 56 (-6.67%)
SALU: 40 -> 36 (-10.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27697>
This commit is contained in:
parent
8cd3a3a520
commit
3b28ba8239
2 changed files with 58 additions and 101 deletions
|
|
@ -1342,54 +1342,6 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
|
||||
{
|
||||
if (preserve_scc)
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
|
||||
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (def.size() == 2)
|
||||
bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
|
||||
else
|
||||
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
||||
|
||||
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
|
||||
Operand(exec, bld.lm));
|
||||
}
|
||||
|
||||
if (preserve_scc)
|
||||
bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
|
||||
Operand::zero());
|
||||
}
|
||||
|
||||
void
|
||||
swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
|
||||
{
|
||||
if (preserve_scc)
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
|
||||
|
||||
Operand def_as_op = Operand(def.physReg(), def.regClass());
|
||||
Definition op_as_def = Definition(op.physReg(), op.regClass());
|
||||
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (bld.program->gfx_level >= GFX9) {
|
||||
bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
|
||||
} else {
|
||||
bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
|
||||
bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
|
||||
bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
|
||||
}
|
||||
|
||||
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
|
||||
Operand(exec, bld.lm));
|
||||
}
|
||||
|
||||
if (preserve_scc)
|
||||
bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
|
||||
Operand::zero());
|
||||
}
|
||||
|
||||
void
|
||||
addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
|
||||
{
|
||||
|
|
@ -1423,8 +1375,6 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
|
|||
*preserve_scc = true;
|
||||
} else if (op.isConstant()) {
|
||||
copy_constant(ctx, bld, def, op);
|
||||
} else if (def.regClass().is_linear_vgpr()) {
|
||||
copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
|
||||
} else if (def.regClass() == v1) {
|
||||
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
||||
} else if (def.regClass() == v2) {
|
||||
|
|
@ -1564,9 +1514,7 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese
|
|||
assert(op.regClass() == def.regClass());
|
||||
Operand def_as_op = Operand(def.physReg(), def.regClass());
|
||||
Definition op_as_def = Definition(op.physReg(), op.regClass());
|
||||
if (def.regClass().is_linear_vgpr()) {
|
||||
swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
|
||||
} else if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
|
||||
if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
|
||||
bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
|
||||
} else if (def.regClass() == v1) {
|
||||
assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
|
||||
|
|
@ -1767,9 +1715,6 @@ try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_
|
|||
copy.op.isConstant() != other->second.op.isConstant())
|
||||
return;
|
||||
|
||||
if (other->second.def.regClass().is_linear_vgpr() != copy.def.regClass().is_linear_vgpr())
|
||||
return;
|
||||
|
||||
/* don't create 64-bit copies before GFX10 */
|
||||
if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
|
||||
ctx->program->gfx_level < GFX10)
|
||||
|
|
@ -2169,6 +2114,41 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
|||
ctx->instructions.size() - num_instructions_before;
|
||||
}
|
||||
|
||||
void
|
||||
handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
||||
amd_gfx_level gfx_level, Pseudo_instruction* pi)
|
||||
{
|
||||
Builder bld(ctx->program, &ctx->instructions);
|
||||
|
||||
for (auto& copy : copy_map) {
|
||||
copy.second.op =
|
||||
Operand(copy.second.op.physReg(), RegClass::get(RegType::vgpr, copy.second.op.bytes()));
|
||||
copy.second.def = Definition(copy.second.def.physReg(),
|
||||
RegClass::get(RegType::vgpr, copy.second.def.bytes()));
|
||||
}
|
||||
|
||||
std::map<PhysReg, copy_operation> second_map(copy_map);
|
||||
handle_operands(second_map, ctx, gfx_level, pi);
|
||||
|
||||
bool tmp_in_scc = pi->tmp_in_scc;
|
||||
if (tmp_in_scc) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
|
||||
pi->tmp_in_scc = false;
|
||||
}
|
||||
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
|
||||
|
||||
handle_operands(copy_map, ctx, gfx_level, pi);
|
||||
|
||||
bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
|
||||
if (tmp_in_scc) {
|
||||
bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
|
||||
Operand::zero());
|
||||
pi->tmp_in_scc = true;
|
||||
}
|
||||
|
||||
ctx->program->statistics[aco_statistic_copies] += tmp_in_scc ? 4 : 2;
|
||||
}
|
||||
|
||||
void
|
||||
emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
|
||||
{
|
||||
|
|
@ -2404,12 +2384,20 @@ lower_to_hw_instr(Program* program)
|
|||
}
|
||||
case aco_opcode::p_parallelcopy: {
|
||||
std::map<PhysReg, copy_operation> copy_operations;
|
||||
bool linear_vgpr = false;
|
||||
bool non_linear_vgpr = false;
|
||||
for (unsigned j = 0; j < instr->operands.size(); j++) {
|
||||
assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
|
||||
copy_operations[instr->definitions[j].physReg()] = {
|
||||
instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
|
||||
linear_vgpr |= instr->definitions[j].regClass().is_linear_vgpr();
|
||||
non_linear_vgpr |= !instr->definitions[j].regClass().is_linear_vgpr();
|
||||
}
|
||||
handle_operands(copy_operations, &ctx, program->gfx_level, pi);
|
||||
assert(!linear_vgpr || !non_linear_vgpr);
|
||||
if (linear_vgpr)
|
||||
handle_operands_linear_vgpr(copy_operations, &ctx, program->gfx_level, pi);
|
||||
else
|
||||
handle_operands(copy_operations, &ctx, program->gfx_level, pi);
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_exit_early_if: {
|
||||
|
|
|
|||
|
|
@ -762,10 +762,7 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
|
|||
if (!setup_cs(NULL, GFX10))
|
||||
return;
|
||||
|
||||
PhysReg reg_s0{0};
|
||||
PhysReg v0_lo{256};
|
||||
PhysReg v0_b3{256};
|
||||
v0_b3.reg_b += 3;
|
||||
PhysReg v1_lo{257};
|
||||
|
||||
//>> p_unit_test 0
|
||||
|
|
@ -775,17 +772,16 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
|
|||
* enough
|
||||
*/
|
||||
|
||||
//! s1: %0:scc = s_cmp_lg_i32 %0:s[0], 0
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||
//! s1: %0:m0 = s_mov_b32 %0:scc
|
||||
//! lv1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0
|
||||
Instruction* instr =
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(scc, s1), Definition(v0_lo, v1.as_linear()),
|
||||
Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear()));
|
||||
Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1.as_linear()),
|
||||
Operand(v1_lo, v1.as_linear()));
|
||||
instr->pseudo().scratch_sgpr = m0;
|
||||
instr->pseudo().tmp_in_scc = true;
|
||||
|
||||
finish_to_hw_instr_test();
|
||||
END_TEST
|
||||
|
|
@ -801,9 +797,9 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
|
|||
//>> p_unit_test 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
|
||||
//! lv1: %0:v[0], lv1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[0], lv1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
|
||||
Definition(reg_v1, v1_linear), Operand(reg_v1, v1_linear),
|
||||
|
|
@ -824,13 +820,11 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_v3)
|
|||
//>> p_unit_test 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
|
||||
//! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! v1: %0:v[2] = v_mov_b32 %0:v[6]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[2] = v_mov_b32 %0:v[6]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[2] = v_mov_b32 %0:v[6]
|
||||
//! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! v1: %0:v[2] = v_mov_b32 %0:v[6]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v3_linear),
|
||||
Operand(reg_v4, v3_linear));
|
||||
|
|
@ -850,9 +844,9 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_coalesce)
|
|||
RegClass v1_linear = v1.as_linear();
|
||||
|
||||
//>> p_unit_test 0
|
||||
//! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
|
||||
|
|
@ -861,31 +855,6 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_coalesce)
|
|||
Operand(reg_v5, v1_linear));
|
||||
instr->pseudo().scratch_sgpr = m0;
|
||||
|
||||
//! p_unit_test 1
|
||||
//! lv1: %0:v[0] = v_mov_b32 %0:v[4]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[0] = v_mov_b32 %0:v[4]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! v1: %0:v[1] = v_mov_b32 %0:v[5]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
|
||||
instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
|
||||
Definition(reg_v1, v1), Operand(reg_v4, v1_linear), Operand(reg_v5, v1));
|
||||
instr->pseudo().scratch_sgpr = m0;
|
||||
|
||||
//! p_unit_test 2
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:v[4]
|
||||
//! lv1: %0:v[1] = v_mov_b32 %0:v[5]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
//! lv1: %0:v[1] = v_mov_b32 %0:v[5]
|
||||
//! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
|
||||
instr =
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1), Definition(reg_v1, v1_linear),
|
||||
Operand(reg_v4, v1), Operand(reg_v5, v1_linear));
|
||||
instr->pseudo().scratch_sgpr = m0;
|
||||
|
||||
finish_to_hw_instr_test();
|
||||
END_TEST
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue