pan/compiler: Use SHADDX instruction for i64 add

For Valhall, use the SHADDX instruction for 64-bit integer addition
instead of lowering it to 32-bit operations. The lowered 32-bit
sequence costs 3 cycles, whereas a single SHADDX takes only 2.
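A value-level sketch of the two strategies in C; the function names and the
exact carry idiom are illustrative, not the compiler's actual output:

/* Lowered form: three 32-bit operations (low add, carry, high add). */
#include <stdint.h>

static uint64_t add64_lowered(uint64_t a, uint64_t b)
{
   uint32_t lo = (uint32_t)a + (uint32_t)b;
   uint32_t carry = lo < (uint32_t)a; /* unsigned overflow of the low add */
   uint32_t hi = (uint32_t)(a >> 32) + (uint32_t)(b >> 32) + carry;
   return ((uint64_t)hi << 32) | lo;
}

/* Valhall: one SHADDX.s64 with shift == 0 computes a + b directly. */
static uint64_t add64_shaddx(uint64_t a, uint64_t b)
{
   return a + b;
}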

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40841>
Jakob Sinclair, 2026-03-27 11:02:55 +01:00, committed by Marge Bot
parent d4b843c24d
commit 4542982062
8 changed files with 57 additions and 10 deletions


@@ -8,6 +8,7 @@
 #include "bi_builder.h"
 #include "compiler.h"
 #include "nodearray.h"
+#include "valhall.h"
 
 struct lcra_state {
    unsigned node_count;
@@ -380,6 +381,9 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
       bi_foreach_ssa_src(ins, s) {
          if (bi_count_read_registers(ins, s) >= 2)
             l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
+         else if (s < valhall_opcodes[ins->op].nr_srcs &&
+                  va_src_info(ins->op, s).size > VA_SIZE_32)
+            l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
       }
    }
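The affinity mask works like a per-node allowlist of base registers; a
minimal sketch, assuming the usual even-bits encoding for EVEN_BITS_MASK:

#include <stdint.h>

/* Bits 0, 2, 4, ... set: only even base registers stay legal, so a
 * 64-bit value can occupy an aligned pair (r2k, r2k+1). Assumed to
 * match the EVEN_BITS_MASK used above. */
#define EVEN_BITS_MASK 0x5555555555555555ull

static int reg_allowed(uint64_t affinity, unsigned reg)
{
   return (affinity >> reg) & 1;
}

/* After affinity &= EVEN_BITS_MASK, reg_allowed(affinity, 4) may still
 * hold, but reg_allowed(affinity, 5) never does. */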


@@ -2310,6 +2310,15 @@ bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
 {
    unsigned bitsize = nir_src_bit_size(src.src);
 
+   if (b->shader->arch >= 9 && bitsize == 64) {
+      /* For Valhall, 64-bit instructions encode only one register but
+       * also read the adjacent register that follows it, so we don't
+       * need to extract a single register here.
+       */
+      assert(comps == 1);
+      return bi_src_index(&src.src);
+   }
+
    /* the bi_index carries the 32-bit (word) offset separate from the
     * subword swizzle, first handle the offset */
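In other words, a 64-bit operand is addressed by its low register and the
hardware supplies the neighbour; a toy model of the pairing rule, purely
illustrative:

#include <assert.h>

/* A 64-bit operand encoded as register r implicitly spans (r, r + 1);
 * together with the even-affinity constraint above, r is always even. */
static unsigned high_half_reg(unsigned low_reg)
{
   assert((low_reg & 1) == 0);
   return low_reg + 1;
}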
@@ -3373,7 +3382,14 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       break;
 
    case nir_op_iadd:
-      bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
+      if (sz == 64) {
+         assert(b->shader->arch >= 9);
+         bi_shaddx_s64_to(b, dst, s0, s1, 0);
+         bi_index dsts[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
+         bi_emit_split_i32(b, dsts, dst, 2);
+         bi_cache_collect(b, dst, dsts, 2);
+      } else
+         bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
       break;
 
    case nir_op_iadd_sat:
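The split/collect bookkeeping amounts to slicing the 64-bit result into
word halves and remembering how to rebuild it. A sketch of the value-level
semantics, assuming SPLIT.I32/COLLECT.I32 are plain word split/concatenate:

#include <stdint.h>

typedef struct { uint32_t lo, hi; } word_pair;

/* SPLIT.I32 of a 64-bit value: two 32-bit words. */
static word_pair split_i32(uint64_t v)
{
   return (word_pair){ .lo = (uint32_t)v, .hi = (uint32_t)(v >> 32) };
}

/* COLLECT.I32 of two words: the 64-bit value back. */
static uint64_t collect_i32(word_pair p)
{
   return ((uint64_t)p.hi << 32) | p.lo;
}

/* collect_i32(split_i32(x)) == x; caching the collect lets later 64-bit
 * consumers reuse the SHADDX result without re-splitting it. */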


@@ -116,7 +116,7 @@ bool valhall_can_merge_workgroups(nir_shader *nir);
                                                                              \
       .lower_doubles_options =                                               \
          nir_lower_dmod, /* TODO: Don't lower supported 64-bit operations */ \
-      .lower_int64_options = ~0, /* TODO: Use IMULD on v7 */                 \
+      .lower_int64_options = arch >= 9 ? ~(nir_lower_iadd64) : ~0,           \
       .lower_mul_high = true,                                                \
       .lower_fisnormal = true,                                               \
       .lower_uadd_carry = true,                                              \
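lower_int64_options is a bitmask of 64-bit operations NIR should lower;
clearing just the iadd bit keeps 64-bit adds in the IR for the backend.
A sketch (nir_lower_iadd64 is the real NIR flag; the wrapper is
illustrative):

#include "nir.h" /* nir_lower_int64_options, nir_lower_iadd64 */

/* Which 64-bit ops NIR lowers for a given Mali architecture major. */
static nir_lower_int64_options
int64_lowering_options(unsigned arch)
{
   /* Valhall (arch >= 9) keeps 64-bit iadd for SHADDX; everything else,
    * and everything on older GPUs, is still lowered by NIR. */
   return (arch >= 9) ? ~nir_lower_iadd64 : ~0;
}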


@@ -77,6 +77,8 @@ bi_count_read_registers(const bi_instr *ins, unsigned s)
       return ins->sr_count_2; /* Dual source blending */
    else if (s == 0 && ins->op == BI_OPCODE_SPLIT_I32)
       return ins->nr_dests;
+   else if (ins->op == BI_OPCODE_SHADDX_S64 || ins->op == BI_OPCODE_SHADDX_U64)
+      return 2;
    else
       return 1;
 }
@@ -123,7 +125,9 @@ bi_count_write_registers(const bi_instr *ins, unsigned d)
       default:
          return bi_count_staging_registers(ins);
       }
-   } else if (ins->op == BI_OPCODE_SEG_ADD_I64) {
+   } else if (ins->op == BI_OPCODE_SEG_ADD_I64 ||
+              ins->op == BI_OPCODE_SHADDX_S64 ||
+              ins->op == BI_OPCODE_SHADDX_U64) {
       return 2;
    } else if (ins->op == BI_OPCODE_TEXC_DUAL && d == 1) {
       return ins->sr_count_2;


@@ -2626,7 +2626,7 @@
       <src widen="true">B</src>
     </group>
 
-    <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" unused="true" unit="CVT">
+    <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" unit="CVT">
       <desc>
         Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
         64-bit value A. These instructions accelerate address arithmetic, but may
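A value-level C model of the description above (a sketch; the swizzle-based
sub-word extension of B is omitted):

#include <stdint.h>

/* dst = A + (B << shift); with shift == 0 this is the plain 64-bit add
 * the compiler now emits for nir_op_iadd. */
static uint64_t shaddx_u64(uint64_t a, uint64_t b, unsigned shift)
{
   return a + (b << shift);
}

/* Address arithmetic, the instruction's main use: base + index * 8. */
static uint64_t index_addr(uint64_t base, uint64_t index)
{
   return shaddx_u64(base, index, 3);
}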


@@ -26,14 +26,18 @@ lower_split_src(bi_context *ctx, bi_instr *I, unsigned s, bi_instr** lut)
       return;
    }
 
-   /* Check if the source regs are already coming from a split. */
+   /* Check if the source regs are already coming from a split/collect pair. */
    bi_index *src_a = &I->src[s];
    bi_index *src_b = &I->src[s + 1];
 
    if (bi_is_ssa(*src_a) && bi_is_ssa(*src_b)) {
       bi_instr *src_ins_a = lut[src_a->value];
       bi_instr *src_ins_b = lut[src_b->value];
-      if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b)
-         return;
+      if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b) {
+         bi_index split_src = src_ins_a->src[0];
+         if (!bi_is_ssa(split_src) ||
+             lut[split_src.value]->op == BI_OPCODE_COLLECT_I32)
+            return;
+      }
    }
 
    /* Allocate temporary before the instruction */
@@ -76,7 +80,9 @@ va_lower_split_64bit(bi_context *ctx)
          struct va_src_info info = va_src_info(I->op, s);
 
-         if (info.size == VA_SIZE_64)
+         /* Only split if the instruction expects 64-bit inputs as two separate
+          * sources. */
+         if (info.size == VA_SIZE_64 && bi_count_read_registers(I, s) == 1)
             lower_split_src(ctx, I, s, lut);
       }
    }
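Restating the guard: a source is only worth splitting when the consuming
instruction wants its 64-bit input as two separate 32-bit sources. A
hypothetical helper with the same logic, relying on the compiler's own
headers and types:

#include "compiler.h"
#include "valhall.h"

/* SHADDX reads two registers per 64-bit source, so it fails this test
 * and keeps its sources whole. */
static bool
wants_split(const bi_instr *I, unsigned s)
{
   return va_src_info(I->op, s).size == VA_SIZE_64 &&
          bi_count_read_registers(I, s) == 1;
}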


@@ -325,6 +325,25 @@ va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
       default:
          invalid_instruction(I, "32-bit widen");
       }
+   } else if (size == VA_SIZE_64) {
+      switch (swz) {
+      case BI_SWIZZLE_H01:
+         return VA_SWIZZLES_64_BIT_NONE;
+      case BI_SWIZZLE_H0:
+         return VA_SWIZZLES_64_BIT_H0;
+      case BI_SWIZZLE_H1:
+         return VA_SWIZZLES_64_BIT_H1;
+      case BI_SWIZZLE_B0:
+         return VA_SWIZZLES_64_BIT_B0;
+      case BI_SWIZZLE_B1:
+         return VA_SWIZZLES_64_BIT_B1;
+      case BI_SWIZZLE_B2:
+         return VA_SWIZZLES_64_BIT_B2;
+      case BI_SWIZZLE_B3:
+         return VA_SWIZZLES_64_BIT_B3;
+      default:
+         invalid_instruction(I, "64-bit widen");
+      }
    } else {
       invalid_instruction(I, "type size for widen");
    }


@@ -34,8 +34,6 @@ SKIP = set([
     "ISUB.u64",
     "ISUB.s64",
     "IMULD.u64",
-    "SHADDX.u64",
-    "SHADDX.s64",
     "IMULD.u64",
     "LSHIFT_AND.i64",
     "RSHIFT_AND.i64",