aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+

This isn't worth it on GFX9-, but the proprietary compiler uses it on
GFX10.

fossil-db (Navi):
Totals from 23825 (17.17% of 138791) affected shaders:
CodeSize: 130623632 -> 130623800 (+0.00%); split: -0.00%, +0.00%
Instrs: 25185559 -> 25108597 (-0.31%)
Cycles: 709864740 -> 708910860 (-0.13%)
VMEM: 7205343 -> 7168839 (-0.51%); split: +0.00%, -0.51%
SMEM: 1584946 -> 1575183 (-0.62%)
Copies: 2043134 -> 1966230 (-3.76%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7798>
This commit is contained in:
Rhys Perry 2020-11-26 14:18:18 +00:00 committed by Marge Bot
parent 8c02a8e2d2
commit f53d4e5f60

View file

@ -932,16 +932,20 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o
def_reg.reg_b += offset;
op_reg.reg_b += offset;
max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
/* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
if (ctx->program->chip_class < GFX10 &&
src.def.regClass().type() == RegType::vgpr)
max_size = MIN2(max_size, 4);
unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
/* make sure the size is a power of two and reg % bytes == 0 */
unsigned bytes = 1;
for (; bytes <= max_size; bytes *= 2) {
unsigned next = bytes * 2u;
bool can_increase = def_reg.reg_b % next == 0 &&
bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
offset + next <= src.bytes && next <= max_size;
if (!src.op.isConstant() && can_increase)
can_increase = op_reg.reg_b % next == 0;
can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
if (!can_increase)
@ -1007,7 +1011,16 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
if (dst.regClass() == s1) {
bld.sop1(aco_opcode::s_mov_b32, dst, op);
} else if (dst.regClass() == s2) {
/* s_ashr_i64 writes SCC, so we can't use it */
assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
bld.sop1(aco_opcode::s_mov_b64, dst, op);
} else if (dst.regClass() == v2) {
if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand(0u), op);
} else {
assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand(0u), op);
}
} else if (dst.regClass() == v1) {
bld.vop1(aco_opcode::v_mov_b32, dst, op);
} else if (dst.regClass() == v1b) {
@ -1076,6 +1089,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
copy_constant(ctx, bld, def, op);
} else if (def.regClass() == v1) {
bld.vop1(aco_opcode::v_mov_b32, def, op);
} else if (def.regClass() == v2) {
bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand(0u), op);
} else if (def.regClass() == s1) {
bld.sop1(aco_opcode::s_mov_b32, def, op);
} else if (def.regClass() == s2) {
@ -1155,7 +1170,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
for (; offset < copy.bytes;) {
Definition def;
Operand op;
split_copy(ctx, offset, &def, &op, copy, true, 8);
unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
split_copy(ctx, offset, &def, &op, copy, true, max_size);
assert(op.regClass() == def.regClass());
Operand def_as_op = Operand(def.physReg(), def.regClass());
@ -1353,9 +1369,16 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
}
/* try to coalesce copies */
unsigned next_def_align = util_next_power_of_two(it->second.bytes + 1);
unsigned next_op_align = next_def_align;
if (it->second.def.regClass().type() == RegType::vgpr)
next_def_align = MIN2(next_def_align, 4);
if (it->second.op.regClass().type() == RegType::vgpr)
next_op_align = MIN2(next_op_align, 4);
if (it->second.bytes < 8 && !it->second.op.isConstant() &&
it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
it->first.reg_b % next_def_align == 0 &&
it->second.op.physReg().reg_b % next_op_align == 0) {
// TODO try more relaxed alignment for subdword copies
PhysReg other_def_reg = it->first;
other_def_reg.reg_b += it->second.bytes;