From ed934ae17b28a353aaba8a826f777acd8eac0e51 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 May 2026 12:57:28 -0400 Subject: [PATCH] i915/corm: add vec construction optimizations Optimize vec2/3/4 construction with several strategies: - same_reg: when all components come from the same register, collapse to a single swizzle+negate alias (zero instructions) - const-swizzle piggybacking: ZERO/ONE sources share a MOV with real-register sources from the same register - per-channel negate: preserve per-channel negate bits through the swizzle path instead of emitting separate negation shader-db (I915_FS=nir): 130/403 compiled, 1614 alu shader-db (I915_FS=both): nir won 130 (26 identical, 16 tied, 86 better, 2 only), 156 TGSI, 117 neither Assisted-by: Claude --- src/gallium/drivers/i915/i915_fpc_nir.c | 88 ++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c index 4df750d4734..e4834c94c4b 100644 --- a/src/gallium/drivers/i915/i915_fpc_nir.c +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -361,14 +361,96 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) case nir_op_vec3: case nir_op_vec4: { unsigned n = nir_op_infos[alu->op].num_inputs; + uint32_t srcs[4] = { 0 }; + for (unsigned i = 0; i < n; i++) + srcs[i] = alu_src_ureg(c, &alu->src[i]); + + bool same_reg = true; + for (unsigned i = 1; i < n; i++) { + if ((srcs[i] & UREG_TYPE_NR_MASK) != (srcs[0] & UREG_TYPE_NR_MASK)) { + same_reg = false; + break; + } + } + + if (same_reg) { + uint32_t base = UREG(GET_UREG_TYPE(srcs[0]), GET_UREG_NR(srcs[0])); + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + for (unsigned i = 0; i < n; i++) { + ch[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7; + ng[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1; + } + i915_release_temp(p, GET_UREG_NR(dest)); + set_ureg(c, def, negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3])); + return; + } + static const uint32_t chan_mask[] = { A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y, A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W, }; + bool emitted[4] = { false }; + uint32_t ch_sel[4]; + int neg_sel[4] = { 0, 0, 0, 0 }; for (unsigned i = 0; i < n; i++) { - uint32_t s = alu_src_ureg(c, &alu->src[i]); - i915_emit_arith(p, A0_MOV, dest, chan_mask[i] & mask, 0, - swizzle(s, X, X, X, X), 0, 0); + ch_sel[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7; + neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1; + } + + /* Process real-register sources first, folding in any ZERO/ONE + * const-swizzle sources that can piggyback on the same MOV. + * Use the unswizzled base register since swizzle() composes. + */ + for (unsigned i = 0; i < n; i++) { + if (emitted[i] || ch_sel[i] >= SRC_ZERO) + continue; + uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), GET_UREG_NR(srcs[i])); + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j] && + (ch_sel[j] >= SRC_ZERO || + (srcs[j] & UREG_TYPE_NR_MASK) == + (srcs[i] & UREG_TYPE_NR_MASK))) { + group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0, + negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]), + 0, 0); + emitted[i] = true; + } + /* Any remaining const-swizzle-only sources */ + for (unsigned i = 0; i < n; i++) { + if (emitted[i]) + continue; + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j]) { + group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0, + negate(swizzle(srcs[i], ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]), + 0, 0); + emitted[i] = true; } break; }