From 4542982062ad5194dc8dd3d40174fcb7821c13f1 Mon Sep 17 00:00:00 2001 From: Jakob Sinclair Date: Fri, 27 Mar 2026 11:02:55 +0100 Subject: [PATCH] pan/compiler: Use SHADDX instruction for i64 add For Valhall, use the SHADDX instruction for 64-bit integer addition instead of lowering it to 32-bit operations. The lowered 32-bit instruction sequence costs 3 cycles, whereas SHADDX takes only 2 cycles. Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/panfrost/compiler/bifrost/bi_ra.c | 4 ++++ .../compiler/bifrost/bifrost_compile.c | 18 +++++++++++++++++- .../compiler/bifrost/bifrost_compile.h | 2 +- src/panfrost/compiler/bifrost/bir.c | 6 +++++- src/panfrost/compiler/bifrost/valhall/ISA.xml | 2 +- .../bifrost/valhall/va_lower_split_64bit.c | 14 ++++++++++---- .../compiler/bifrost/valhall/va_pack.c | 19 +++++++++++++++++++ .../compiler/bifrost/valhall/valhall.c.py | 2 -- 8 files changed, 57 insertions(+), 10 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index e10ee92bfa3..f8579f8c983 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -8,6 +8,7 @@ #include "bi_builder.h" #include "compiler.h" #include "nodearray.h" +#include "valhall.h" struct lcra_state { unsigned node_count; @@ -380,6 +381,9 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, bi_foreach_ssa_src(ins, s) { if (bi_count_read_registers(ins, s) >= 2) l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; + else if (s < valhall_opcodes[ins->op].nr_srcs && + va_src_info(ins->op, s).size > VA_SIZE_32) + l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; } } diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 1690a43292d..98312dd6d06 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -2310,6 +2310,15 @@ bi_alu_src_index(bi_builder *b, 
nir_alu_src src, unsigned comps) { unsigned bitsize = nir_src_bit_size(src.src); + if (b->shader->arch >= 9 && bitsize == 64) { + /* For Valhall, 64-bit instructions only encode one register but will read + * the adjacent register that comes right after as well. Therefore we + * don't need to extract a single register here. + */ + assert(comps == 1); + return bi_src_index(&src.src); + } + /* the bi_index carries the 32-bit (word) offset separate from the * subword swizzle, first handle the offset */ @@ -3373,7 +3382,14 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_iadd: - bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); + if (sz == 64) { + assert(b->shader->arch >= 9); + bi_shaddx_s64_to(b, dst, s0, s1, 0); + bi_index dsts[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; + bi_emit_split_i32(b, dsts, dst, 2); + bi_cache_collect(b, dst, dsts, 2); + } else + bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); break; case nir_op_iadd_sat: diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.h b/src/panfrost/compiler/bifrost/bifrost_compile.h index 70f6221d9b7..811f5f4339b 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.h +++ b/src/panfrost/compiler/bifrost/bifrost_compile.h @@ -116,7 +116,7 @@ bool valhall_can_merge_workgroups(nir_shader *nir); \ .lower_doubles_options = \ nir_lower_dmod, /* TODO: Don't lower supported 64-bit operations */ \ - .lower_int64_options = ~0, /* TODO: Use IMULD on v7 */ \ + .lower_int64_options = arch >= 9 ? 
~(nir_lower_iadd64) : ~0, \ .lower_mul_high = true, \ .lower_fisnormal = true, \ .lower_uadd_carry = true, \ diff --git a/src/panfrost/compiler/bifrost/bir.c b/src/panfrost/compiler/bifrost/bir.c index 366a0e5d064..8da950f32ae 100644 --- a/src/panfrost/compiler/bifrost/bir.c +++ b/src/panfrost/compiler/bifrost/bir.c @@ -77,6 +77,8 @@ bi_count_read_registers(const bi_instr *ins, unsigned s) return ins->sr_count_2; /* Dual source blending */ else if (s == 0 && ins->op == BI_OPCODE_SPLIT_I32) return ins->nr_dests; + else if (ins->op == BI_OPCODE_SHADDX_S64 || ins->op == BI_OPCODE_SHADDX_U64) + return 2; else return 1; } @@ -123,7 +125,9 @@ bi_count_write_registers(const bi_instr *ins, unsigned d) default: return bi_count_staging_registers(ins); } - } else if (ins->op == BI_OPCODE_SEG_ADD_I64) { + } else if (ins->op == BI_OPCODE_SEG_ADD_I64 || + ins->op == BI_OPCODE_SHADDX_S64 || + ins->op == BI_OPCODE_SHADDX_U64) { return 2; } else if (ins->op == BI_OPCODE_TEXC_DUAL && d == 1) { return ins->sr_count_2; diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index f4d53389c79..43b292f2c57 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -2626,7 +2626,7 @@ B - + Sign or zero extend B to 64-bits, left-shift by `shift`, and add the 64-bit value A. These instructions accelerate address arithmetic, but may diff --git a/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c b/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c index 06a0bb05200..6b81346845c 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c +++ b/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c @@ -26,14 +26,18 @@ lower_split_src(bi_context *ctx, bi_instr *I, unsigned s, bi_instr** lut) return; } - /* Check if the source regs are already coming from a split. */ + /* Check if the source regs are already coming from a split/collect pair. 
*/ bi_index *src_a = &I->src[s]; bi_index *src_b = &I->src[s + 1]; if (bi_is_ssa(*src_a) && bi_is_ssa(*src_b)) { bi_instr *src_ins_a = lut[src_a->value]; bi_instr *src_ins_b = lut[src_b->value]; - if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b) - return; + if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b) { + bi_index split_src = src_ins_a->src[0]; + if (!bi_is_ssa(split_src) || + lut[split_src.value]->op == BI_OPCODE_COLLECT_I32) + return; + } } /* Allocate temporary before the instruction */ @@ -76,7 +80,9 @@ va_lower_split_64bit(bi_context *ctx) struct va_src_info info = va_src_info(I->op, s); - if (info.size == VA_SIZE_64) + /* Only split if the instruction expects 64-bit inputs as two separate + * sources. */ + if (info.size == VA_SIZE_64 && bi_count_read_registers(I, s) == 1) lower_split_src(ctx, I, s, lut); } } diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index 5eefb9b5ff8..9665cc1cfd5 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -325,6 +325,25 @@ va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size) default: invalid_instruction(I, "32-bit widen"); } + } else if (size == VA_SIZE_64) { + switch (swz) { + case BI_SWIZZLE_H01: + return VA_SWIZZLES_64_BIT_NONE; + case BI_SWIZZLE_H0: + return VA_SWIZZLES_64_BIT_H0; + case BI_SWIZZLE_H1: + return VA_SWIZZLES_64_BIT_H1; + case BI_SWIZZLE_B0: + return VA_SWIZZLES_64_BIT_B0; + case BI_SWIZZLE_B1: + return VA_SWIZZLES_64_BIT_B1; + case BI_SWIZZLE_B2: + return VA_SWIZZLES_64_BIT_B2; + case BI_SWIZZLE_B3: + return VA_SWIZZLES_64_BIT_B3; + default: + invalid_instruction(I, "64-bit widen"); + } } else { invalid_instruction(I, "type size for widen"); } diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index 3645092b836..81e9a2ba523 100644 --- 
a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -34,8 +34,6 @@ SKIP = set([ "ISUB.u64", "ISUB.s64", "IMULD.u64", - "SHADDX.u64", - "SHADDX.s64", "IMULD.u64", "LSHIFT_AND.i64", "RSHIFT_AND.i64",