From 4885eb02ab9e581b144942fd4a03b5ae24cac9d7 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 May 2026 12:59:42 -0400 Subject: [PATCH] i915/corm: optimize seq/sne against zero to 2 instructions When opts.seq_sne_opt is set and one operand is zero, use the abs+compare pattern: x == 0 becomes -abs(x) >= 0, and x != 0 becomes -abs(x) < 0. This reduces from 3 ALU instructions to 2. This is a variant dimension because it can increase register pressure in some shaders; the multi-variant framework picks the winner per-shader. shader-db (I915_FS=nir): 212/403 compiled, 3228 alu shader-db (I915_FS=both): nir won 212 (26 identical, 16 tied, 167 better, 3 only), 75 TGSI, 116 neither Assisted-by: Claude --- src/gallium/drivers/i915/i915_fpc_nir.c | 54 +++++++++++++++++++------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c index 96e28a23fe4..99a61d385c3 100644 --- a/src/gallium/drivers/i915/i915_fpc_nir.c +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -394,21 +394,51 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0); break; case nir_op_seq: { - /* seq(a,b) = sge(a,b) * sge(b,a) */ - uint32_t tmp = i915_get_utemp(p); - i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0, - src0, src1, 0); - i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0); - i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0); + const uint32_t zero = + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c->opts.seq_sne_opt && + ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) || + (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) { + if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK)) + src0 = src1; + /* x == 0 <-> -abs(x) >= 0: 2 insns instead of 3 */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, negate(src0, 1, 1, 1, 1), 0); + i915_emit_arith(p, A0_SGE, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), zero, 0); + } else { + /* seq(a,b) = sge(a,b) * sge(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0); + i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0); + } break; } case nir_op_sne: { - /* sne(a,b) = slt(a,b) + slt(b,a) */ - uint32_t tmp = i915_get_utemp(p); - i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, - src0, src1, 0); - i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0); - i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0); + const uint32_t zero = + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c->opts.seq_sne_opt && + ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) || + (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) { + if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK)) + src0 = src1; + /* x != 0 <-> -abs(x) < 0: 2 insns instead of 3 */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, negate(src0, 1, 1, 1, 1), 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), zero, 0); + } else { + /* sne(a,b) = slt(a,b) + slt(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0); + i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0); + } break; } case nir_op_fpow: {