From 800375c3c4bc09bf0719f5b39927d4f34eb8f01a Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Thu, 7 May 2026 09:51:19 -0400 Subject: [PATCH] i915/corm: fuse binary ALU ops through vec construction When a vec's only consumer is a binary ALU op (MUL, ADD, MIN, MAX) and the other source is a single register, emit the ALU op directly per register group with partial writemasks instead of building the vec with MOVs and then applying the ALU op. For example, fmul(vec4(a.zw, b.xy), tex) becomes: MUL oC.xy, a.zw, tex MUL oC.zw, b.xy, tex instead of: MOV R.xy, a.zw MOV R.zw, b.xy MUL oC, R, tex shader-db (I915_FS=nir): 248/403 compiled, 3544 alu shader-db (I915_FS=both): nir won 248 (26 identical, 1 tied, 218 better, 3 only), 39 TGSI, 116 neither Assisted-by: Claude --- src/gallium/drivers/i915/i915_fpc_nir.c | 88 +++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c index 2e395c16777..15e935cd67b 100644 --- a/src/gallium/drivers/i915/i915_fpc_nir.c +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -292,6 +292,10 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) { struct i915_fp_compile *p = c->p; nir_def *def = &alu->def; + + if (def->index < c->ureg_map_size && c->ureg_map[def->index] != 0) + return; + uint32_t mask = def_mask(def); uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p)); set_ureg(c, def, dest); @@ -594,6 +598,90 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) emitted[i + j] = true; } + /* ALU consumer fusion: if this vec feeds a single binary ALU op + * and the other ALU source is a single register, emit the ALU op + * per-group with partial writemasks instead of MOV+ALU. + */ + if (list_is_singular(&def->uses)) { + nir_src *use = list_first_entry(&def->uses, nir_src, use_link); + nir_instr *use_instr = nir_src_use_instr(use); + if (use_instr->type == nir_instr_type_alu) { + nir_alu_instr *consumer = nir_instr_as_alu(use_instr); + unsigned nargs = nir_op_infos[consumer->op].num_inputs; + int vec_arg = -1; + for (unsigned a = 0; a < nargs; a++) { + if (consumer->src[a].src.ssa == def) { + vec_arg = a; + break; + } + } + uint32_t hw_op = 0; + bool can_fuse = (vec_arg >= 0 && nargs == 2); + if (can_fuse) { + switch (consumer->op) { + case nir_op_fmul: hw_op = A0_MUL; break; + case nir_op_fadd: hw_op = A0_ADD; break; + case nir_op_fmin: case nir_op_imin: case nir_op_umin: + hw_op = A0_MIN; break; + case nir_op_fmax: case nir_op_imax: case nir_op_umax: + hw_op = A0_MAX; break; + default: can_fuse = false; break; + } + } + /* check the non-vec source is a single register */ + if (can_fuse) { + int other_arg = 1 - vec_arg; + nir_def *other_def = consumer->src[other_arg].src.ssa; + if (other_def->index < c->ureg_map_size && + c->ureg_map[other_def->index] != UREG_BAD) { + uint32_t other = alu_src_ureg(c, &consumer->src[other_arg]); + nir_def *cdef = &consumer->def; + uint32_t cdest = dest; + uint32_t cmask = def_mask(cdef); + + for (unsigned i = 0; i < n; i++) { + if (emitted[i]) + continue; + uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), + GET_UREG_NR(srcs[i])); + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j] && + (ch_sel[j] >= SRC_ZERO || + (srcs[j] & UREG_TYPE_NR_MASK) == + (srcs[i] & UREG_TYPE_NR_MASK))) { + group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + uint32_t fused_src = negate( + swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]); + if (vec_arg == 0) + i915_emit_arith(p, hw_op, cdest, + group_mask & cmask, 0, + fused_src, other, 0); + else + i915_emit_arith(p, hw_op, cdest, + group_mask & cmask, 0, + other, fused_src, 0); + emitted[i] = true; + } + + set_ureg(c, cdef, cdest); + c->def_csr[cdef->index] = p->csr - 3; + break; + } + } + } + } + /* Process real-register sources first, folding in any ZERO/ONE * const-swizzle sources that can piggyback on the same MOV. * Use the unswizzled base register since swizzle() composes.