aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+

This isn't worth it on GFX9-, but the proprietary compiler uses it on GFX10. fossil-db (Navi): Totals from 23825 (17.17% of 138791) affected shaders: CodeSize: 130623632 -> 130623800 (+0.00%); split: -0.00%, +0.00% Instrs: 25185559 -> 25108597 (-0.31%) Cycles: 709864740 -> 708910860 (-0.13%) VMEM: 7205343 -> 7168839 (-0.51%); split: +0.00%, -0.51% SMEM: 1584946 -> 1575183 (-0.62%) Copies: 2043134 -> 1966230 (-3.76%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7798>
2026-04-20 23:00:36 +02:00 · 2020-11-26 14:18:18 +00:00 · 2020-11-26 14:18:18 +00:00 · f53d4e5f60
commit f53d4e5f60
parent 8c02a8e2d2
1 changed files with 29 additions and 6 deletions
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@ -932,16 +932,20 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o
   def_reg.reg_b += offset;
   op_reg.reg_b += offset;

-   max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
+   /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
+   if (ctx->program->chip_class < GFX10 &&
+       src.def.regClass().type() == RegType::vgpr)
+      max_size = MIN2(max_size, 4);
+   unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;

   /* make sure the size is a power of two and reg % bytes == 0 */
   unsigned bytes = 1;
   for (; bytes <= max_size; bytes *= 2) {
      unsigned next = bytes * 2u;
-      bool can_increase = def_reg.reg_b % next == 0 &&
+      bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
                          offset + next <= src.bytes && next <= max_size;
      if (!src.op.isConstant() && can_increase)
-         can_increase = op_reg.reg_b % next == 0;
+         can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
      if (!can_increase)
@ -1007,7 +1011,16 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_mov_b32, dst, op);
   } else if (dst.regClass() == s2) {
+      /* s_ashr_i64 writes SCC, so we can't use it */
+      assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
      bld.sop1(aco_opcode::s_mov_b64, dst, op);
+   } else if (dst.regClass() == v2) {
+      if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
+         bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand(0u), op);
+      } else {
+         assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
+         bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand(0u), op);
+      }
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_mov_b32, dst, op);
   } else if (dst.regClass() == v1b) {
@ -1076,6 +1089,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
         copy_constant(ctx, bld, def, op);
      } else if (def.regClass() == v1) {
         bld.vop1(aco_opcode::v_mov_b32, def, op);
+      } else if (def.regClass() == v2) {
+         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand(0u), op);
      } else if (def.regClass() == s1) {
         bld.sop1(aco_opcode::s_mov_b32, def, op);
      } else if (def.regClass() == s2) {
@ -1155,7 +1170,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
   for (; offset < copy.bytes;) {
      Definition def;
      Operand op;
-      split_copy(ctx, offset, &def, &op, copy, true, 8);
+      unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
+      split_copy(ctx, offset, &def, &op, copy, true, max_size);

      assert(op.regClass() == def.regClass());
      Operand def_as_op = Operand(def.physReg(), def.regClass());
@ -1353,9 +1369,16 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
      }

      /* try to coalesce copies */
+      unsigned next_def_align = util_next_power_of_two(it->second.bytes + 1);
+      unsigned next_op_align = next_def_align;
+      if (it->second.def.regClass().type() == RegType::vgpr)
+         next_def_align = MIN2(next_def_align, 4);
+      if (it->second.op.regClass().type() == RegType::vgpr)
+         next_op_align = MIN2(next_op_align, 4);
+
      if (it->second.bytes < 8 && !it->second.op.isConstant() &&
-          it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
-          it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
+          it->first.reg_b % next_def_align == 0 &&
+          it->second.op.physReg().reg_b % next_op_align == 0) {
         // TODO try more relaxed alignment for subdword copies
         PhysReg other_def_reg = it->first;
         other_def_reg.reg_b += it->second.bytes;