pan/compiler: Use SHADDX instruction for i64 add

For Valhall, use the SHADDX instruction for 64-bit integer addition
instead of lowering it to 32-bit operations. The lowered 32-bit
sequence costs 3 cycles, whereas a single SHADDX takes only 2.
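A value-level sketch of the two strategies in C; the function names and the
exact carry idiom are illustrative, not the compiler's actual output:

/* Lowered form: three 32-bit operations (low add, carry, high add). */
#include <stdint.h>

static uint64_t add64_lowered(uint64_t a, uint64_t b)
{
   uint32_t lo = (uint32_t)a + (uint32_t)b;
   uint32_t carry = lo < (uint32_t)a; /* unsigned overflow of the low add */
   uint32_t hi = (uint32_t)(a >> 32) + (uint32_t)(b >> 32) + carry;
   return ((uint64_t)hi << 32) | lo;
}

/* Valhall: one SHADDX.s64 with shift == 0 computes a + b directly. */
static uint64_t add64_shaddx(uint64_t a, uint64_t b)
{
   return a + b;
}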

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40841>
Jakob Sinclair, 2026-03-27 11:02:55 +01:00, committed by Marge Bot
parent d4b843c24d
commit 4542982062
8 changed files with 57 additions and 10 deletions


@@ -8,6 +8,7 @@
 #include "bi_builder.h"
 #include "compiler.h"
 #include "nodearray.h"
+#include "valhall.h"
 
 struct lcra_state {
    unsigned node_count;
@@ -380,6 +381,9 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
       bi_foreach_ssa_src(ins, s) {
          if (bi_count_read_registers(ins, s) >= 2)
             l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
+         else if (s < valhall_opcodes[ins->op].nr_srcs &&
+                  va_src_info(ins->op, s).size > VA_SIZE_32)
+            l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
       }
    }
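The affinity mask works like a per-node allowlist of base registers; a
minimal sketch, assuming the usual even-bits encoding for EVEN_BITS_MASK:

#include <stdint.h>

/* Bits 0, 2, 4, ... set: only even base registers stay legal, so a
 * 64-bit value can occupy an aligned pair (r2k, r2k+1). Assumed to
 * match the EVEN_BITS_MASK used above. */
#define EVEN_BITS_MASK 0x5555555555555555ull

static int reg_allowed(uint64_t affinity, unsigned reg)
{
   return (affinity >> reg) & 1;
}

/* After affinity &= EVEN_BITS_MASK, reg_allowed(affinity, 4) may still
 * hold, but reg_allowed(affinity, 5) never does. */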


@@ -2310,6 +2310,15 @@ bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
 {
    unsigned bitsize = nir_src_bit_size(src.src);
 
+   if (b->shader->arch >= 9 && bitsize == 64) {
+      /* For Valhall, 64-bit instructions encode only one register but
+       * also read the adjacent register that follows it, so we don't
+       * need to extract a single register here.
+       */
+      assert(comps == 1);
+      return bi_src_index(&src.src);
+   }
+
    /* the bi_index carries the 32-bit (word) offset separate from the
     * subword swizzle, first handle the offset */
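In other words, a 64-bit operand is addressed by its low register and the
hardware supplies the neighbour; a toy model of the pairing rule, purely
illustrative:

#include <assert.h>

/* A 64-bit operand encoded as register r implicitly spans (r, r + 1);
 * together with the even-affinity constraint above, r is always even. */
static unsigned high_half_reg(unsigned low_reg)
{
   assert((low_reg & 1) == 0);
   return low_reg + 1;
}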
@@ -3373,7 +3382,14 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       break;
 
    case nir_op_iadd:
-      bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
+      if (sz == 64) {
+         assert(b->shader->arch >= 9);
+         bi_shaddx_s64_to(b, dst, s0, s1, 0);
+         bi_index dsts[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
+         bi_emit_split_i32(b, dsts, dst, 2);
+         bi_cache_collect(b, dst, dsts, 2);
+      } else
+         bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
       break;
 
    case nir_op_iadd_sat:
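The split/collect bookkeeping amounts to slicing the 64-bit result into
word halves and remembering how to rebuild it. A sketch of the value-level
semantics, assuming SPLIT.I32/COLLECT.I32 are plain word split/concatenate:

#include <stdint.h>

typedef struct { uint32_t lo, hi; } word_pair;

/* SPLIT.I32 of a 64-bit value: two 32-bit words. */
static word_pair split_i32(uint64_t v)
{
   return (word_pair){ .lo = (uint32_t)v, .hi = (uint32_t)(v >> 32) };
}

/* COLLECT.I32 of two words: the 64-bit value back. */
static uint64_t collect_i32(word_pair p)
{
   return ((uint64_t)p.hi << 32) | p.lo;
}

/* collect_i32(split_i32(x)) == x; caching the collect lets later 64-bit
 * consumers reuse the SHADDX result without re-splitting it. */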


@@ -116,7 +116,7 @@ bool valhall_can_merge_workgroups(nir_shader *nir);
                                                                              \
       .lower_doubles_options =                                               \
          nir_lower_dmod, /* TODO: Don't lower supported 64-bit operations */ \
-      .lower_int64_options = ~0, /* TODO: Use IMULD on v7 */                 \
+      .lower_int64_options = arch >= 9 ? ~(nir_lower_iadd64) : ~0,           \
       .lower_mul_high = true,                                                \
       .lower_fisnormal = true,                                               \
       .lower_uadd_carry = true,                                              \
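lower_int64_options is a bitmask of 64-bit operations NIR should lower;
clearing just the iadd bit keeps 64-bit adds in the IR for the backend.
A sketch (nir_lower_iadd64 is the real NIR flag; the wrapper is
illustrative):

#include "nir.h" /* nir_lower_int64_options, nir_lower_iadd64 */

/* Which 64-bit ops NIR lowers for a given Mali architecture major. */
static nir_lower_int64_options
int64_lowering_options(unsigned arch)
{
   /* Valhall (arch >= 9) keeps 64-bit iadd for SHADDX; everything else,
    * and everything on older GPUs, is still lowered by NIR. */
   return (arch >= 9) ? ~nir_lower_iadd64 : ~0;
}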


@@ -77,6 +77,8 @@ bi_count_read_registers(const bi_instr *ins, unsigned s)
       return ins->sr_count_2; /* Dual source blending */
    else if (s == 0 && ins->op == BI_OPCODE_SPLIT_I32)
       return ins->nr_dests;
+   else if (ins->op == BI_OPCODE_SHADDX_S64 || ins->op == BI_OPCODE_SHADDX_U64)
+      return 2;
    else
       return 1;
 }
@@ -123,7 +125,9 @@ bi_count_write_registers(const bi_instr *ins, unsigned d)
       default:
          return bi_count_staging_registers(ins);
       }
-   } else if (ins->op == BI_OPCODE_SEG_ADD_I64) {
+   } else if (ins->op == BI_OPCODE_SEG_ADD_I64 ||
+              ins->op == BI_OPCODE_SHADDX_S64 ||
+              ins->op == BI_OPCODE_SHADDX_U64) {
       return 2;
    } else if (ins->op == BI_OPCODE_TEXC_DUAL && d == 1) {
       return ins->sr_count_2;


@@ -2626,7 +2626,7 @@
       <src widen="true">B</src>
     </group>
 
-    <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" unused="true" unit="CVT">
+    <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" unit="CVT">
       <desc>
         Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
         64-bit value A. These instructions accelerate address arithmetic, but may
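A value-level C model of the description above (a sketch; the swizzle-based
sub-word extension of B is omitted):

#include <stdint.h>

/* dst = A + (B << shift); with shift == 0 this is the plain 64-bit add
 * the compiler now emits for nir_op_iadd. */
static uint64_t shaddx_u64(uint64_t a, uint64_t b, unsigned shift)
{
   return a + (b << shift);
}

/* Address arithmetic, the instruction's main use: base + index * 8. */
static uint64_t index_addr(uint64_t base, uint64_t index)
{
   return shaddx_u64(base, index, 3);
}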


@@ -26,14 +26,18 @@ lower_split_src(bi_context *ctx, bi_instr *I, unsigned s, bi_instr** lut)
       return;
    }
 
-   /* Check if the source regs are already coming from a split. */
+   /* Check if the source regs are already coming from a split/collect pair. */
    bi_index *src_a = &I->src[s];
    bi_index *src_b = &I->src[s + 1];
 
    if (bi_is_ssa(*src_a) && bi_is_ssa(*src_b)) {
       bi_instr *src_ins_a = lut[src_a->value];
       bi_instr *src_ins_b = lut[src_b->value];
-      if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b)
-         return;
+      if (src_ins_a->op == BI_OPCODE_SPLIT_I32 && src_ins_a == src_ins_b) {
+         bi_index split_src = src_ins_a->src[0];
+         if (!bi_is_ssa(split_src) ||
+             lut[split_src.value]->op == BI_OPCODE_COLLECT_I32)
+            return;
+      }
    }
 
    /* Allocate temporary before the instruction */
@@ -76,7 +80,9 @@ va_lower_split_64bit(bi_context *ctx)
          struct va_src_info info = va_src_info(I->op, s);
 
-         if (info.size == VA_SIZE_64)
+         /* Only split if the instruction expects 64-bit inputs as two separate
+          * sources. */
+         if (info.size == VA_SIZE_64 && bi_count_read_registers(I, s) == 1)
             lower_split_src(ctx, I, s, lut);
       }
    }
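Restating the guard: a source is only worth splitting when the consuming
instruction wants its 64-bit input as two separate 32-bit sources. A
hypothetical helper with the same logic, relying on the compiler's own
headers and types:

#include "compiler.h"
#include "valhall.h"

/* SHADDX reads two registers per 64-bit source, so it fails this test
 * and keeps its sources whole. */
static bool
wants_split(const bi_instr *I, unsigned s)
{
   return va_src_info(I->op, s).size == VA_SIZE_64 &&
          bi_count_read_registers(I, s) == 1;
}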


@@ -325,6 +325,25 @@ va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
       default:
          invalid_instruction(I, "32-bit widen");
       }
+   } else if (size == VA_SIZE_64) {
+      switch (swz) {
+      case BI_SWIZZLE_H01:
+         return VA_SWIZZLES_64_BIT_NONE;
+      case BI_SWIZZLE_H0:
+         return VA_SWIZZLES_64_BIT_H0;
+      case BI_SWIZZLE_H1:
+         return VA_SWIZZLES_64_BIT_H1;
+      case BI_SWIZZLE_B0:
+         return VA_SWIZZLES_64_BIT_B0;
+      case BI_SWIZZLE_B1:
+         return VA_SWIZZLES_64_BIT_B1;
+      case BI_SWIZZLE_B2:
+         return VA_SWIZZLES_64_BIT_B2;
+      case BI_SWIZZLE_B3:
+         return VA_SWIZZLES_64_BIT_B3;
+      default:
+         invalid_instruction(I, "64-bit widen");
+      }
    } else {
       invalid_instruction(I, "type size for widen");
    }


@@ -34,8 +34,6 @@ SKIP = set([
     "ISUB.u64",
     "ISUB.s64",
     "IMULD.u64",
-    "SHADDX.u64",
-    "SHADDX.s64",
     "IMULD.u64",
     "LSHIFT_AND.i64",
     "RSHIFT_AND.i64",