mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-20 23:00:36 +02:00
aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+
This isn't worth it on GFX9-, but the proprietary compiler uses it on GFX10.

fossil-db (Navi):
Totals from 23825 (17.17% of 138791) affected shaders:
CodeSize: 130623632 -> 130623800 (+0.00%); split: -0.00%, +0.00%
Instrs: 25185559 -> 25108597 (-0.31%)
Cycles: 709864740 -> 708910860 (-0.13%)
VMEM: 7205343 -> 7168839 (-0.51%); split: +0.00%, -0.51%
SMEM: 1584946 -> 1575183 (-0.62%)
Copies: 2043134 -> 1966230 (-3.76%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7798>
This commit is contained in:
parent
8c02a8e2d2
commit
f53d4e5f60
1 changed file with 29 additions and 6 deletions
|
|
@@ -932,16 +932,20 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o
|
|||
def_reg.reg_b += offset;
|
||||
op_reg.reg_b += offset;
|
||||
|
||||
max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
|
||||
/* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
|
||||
if (ctx->program->chip_class < GFX10 &&
|
||||
src.def.regClass().type() == RegType::vgpr)
|
||||
max_size = MIN2(max_size, 4);
|
||||
unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
|
||||
|
||||
/* make sure the size is a power of two and reg % bytes == 0 */
|
||||
unsigned bytes = 1;
|
||||
for (; bytes <= max_size; bytes *= 2) {
|
||||
unsigned next = bytes * 2u;
|
||||
bool can_increase = def_reg.reg_b % next == 0 &&
|
||||
bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
|
||||
offset + next <= src.bytes && next <= max_size;
|
||||
if (!src.op.isConstant() && can_increase)
|
||||
can_increase = op_reg.reg_b % next == 0;
|
||||
can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
|
||||
for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
|
||||
can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
|
||||
if (!can_increase)
|
||||
|
|
@@ -1007,7 +1011,16 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
|
|||
if (dst.regClass() == s1) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, dst, op);
|
||||
} else if (dst.regClass() == s2) {
|
||||
/* s_ashr_i64 writes SCC, so we can't use it */
|
||||
assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
|
||||
bld.sop1(aco_opcode::s_mov_b64, dst, op);
|
||||
} else if (dst.regClass() == v2) {
|
||||
if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
|
||||
bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand(0u), op);
|
||||
} else {
|
||||
assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
|
||||
bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand(0u), op);
|
||||
}
|
||||
} else if (dst.regClass() == v1) {
|
||||
bld.vop1(aco_opcode::v_mov_b32, dst, op);
|
||||
} else if (dst.regClass() == v1b) {
|
||||
|
|
@@ -1076,6 +1089,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
|
|||
copy_constant(ctx, bld, def, op);
|
||||
} else if (def.regClass() == v1) {
|
||||
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
||||
} else if (def.regClass() == v2) {
|
||||
bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand(0u), op);
|
||||
} else if (def.regClass() == s1) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, def, op);
|
||||
} else if (def.regClass() == s2) {
|
||||
|
|
@@ -1155,7 +1170,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
|
|||
for (; offset < copy.bytes;) {
|
||||
Definition def;
|
||||
Operand op;
|
||||
split_copy(ctx, offset, &def, &op, copy, true, 8);
|
||||
unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
|
||||
split_copy(ctx, offset, &def, &op, copy, true, max_size);
|
||||
|
||||
assert(op.regClass() == def.regClass());
|
||||
Operand def_as_op = Operand(def.physReg(), def.regClass());
|
||||
|
|
@@ -1353,9 +1369,16 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
|
|||
}
|
||||
|
||||
/* try to coalesce copies */
|
||||
unsigned next_def_align = util_next_power_of_two(it->second.bytes + 1);
|
||||
unsigned next_op_align = next_def_align;
|
||||
if (it->second.def.regClass().type() == RegType::vgpr)
|
||||
next_def_align = MIN2(next_def_align, 4);
|
||||
if (it->second.op.regClass().type() == RegType::vgpr)
|
||||
next_op_align = MIN2(next_op_align, 4);
|
||||
|
||||
if (it->second.bytes < 8 && !it->second.op.isConstant() &&
|
||||
it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
|
||||
it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
|
||||
it->first.reg_b % next_def_align == 0 &&
|
||||
it->second.op.physReg().reg_b % next_op_align == 0) {
|
||||
// TODO try more relaxed alignment for subdword copies
|
||||
PhysReg other_def_reg = it->first;
|
||||
other_def_reg.reg_b += it->second.bytes;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue