i915/corm: optimize seq/sne against zero to 2 instructions

When opts.seq_sne_opt is set and one operand is zero, use the
abs+compare pattern: x == 0 becomes -abs(x) >= 0, and x != 0
becomes -abs(x) < 0, where abs(x) is emitted as MAX(x, -x). This
reduces the sequence from 3 ALU instructions to 2.

This is a variant dimension because it can increase register
pressure in some shaders; the multi-variant framework picks the
winner per-shader.

shader-db (I915_FS=nir): 212/403 compiled, 3228 alu
shader-db (I915_FS=both): nir won 212 (26 identical, 16 tied, 167 better, 3 only),
  75 TGSI, 116 neither

Assisted-by: Claude
This commit is contained in:
Adam Jackson 2026-05-06 12:59:42 -04:00
parent 75ef9f6d65
commit 4885eb02ab

View file

@ -394,21 +394,51 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0);
break;
case nir_op_seq: {
/* seq(a,b) = sge(a,b) * sge(b,a) */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, src1, 0);
i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
const uint32_t zero =
swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
if (c->opts.seq_sne_opt &&
((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
(src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
src0 = src1;
/* x == 0 <-> -abs(x) >= 0: 2 insns instead of 3 */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, negate(src0, 1, 1, 1, 1), 0);
i915_emit_arith(p, A0_SGE, dest, mask, 0,
negate(tmp, 1, 1, 1, 1), zero, 0);
} else {
/* seq(a,b) = sge(a,b) * sge(b,a) */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, src1, 0);
i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0);
i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0);
}
break;
}
case nir_op_sne: {
/* sne(a,b) = slt(a,b) + slt(b,a) */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, src1, 0);
i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
const uint32_t zero =
swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
if (c->opts.seq_sne_opt &&
((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) ||
(src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) {
if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))
src0 = src1;
/* x != 0 <-> -abs(x) < 0: 2 insns instead of 3 */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, negate(src0, 1, 1, 1, 1), 0);
i915_emit_arith(p, A0_SLT, dest, mask, 0,
negate(tmp, 1, 1, 1, 1), zero, 0);
} else {
/* sne(a,b) = slt(a,b) + slt(b,a) */
uint32_t tmp = i915_get_utemp(p);
i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0,
src0, src1, 0);
i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0);
i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0);
}
break;
}
case nir_op_fpow: {