i915/corm: fuse binary ALU ops through vec construction

When a vec's only consumer is a binary ALU op (MUL, ADD, MIN, MAX) and the
other source is a single register, emit the ALU op directly per register
group with partial writemasks instead of building the vec with MOVs and
then applying the ALU op.

For example, fmul(vec4(a.zw, b.xy), tex) becomes:

    MUL oC.xy, a.zw, tex
    MUL oC.zw, b.xy, tex

instead of:

    MOV R.xy, a.zw
    MOV R.zw, b.xy
    MUL oC, R, tex

shader-db (I915_FS=nir): 248/403 compiled, 3544 alu
shader-db (I915_FS=both): nir won 248 (26 identical, 1 tied, 218 better, 3 only), 39 TGSI, 116 neither

Assisted-by: Claude
2026-05-08 09:08:10 +02:00 · 2026-05-07 09:51:19 -04:00 · 2026-05-07 09:51:19 -04:00 · 800375c3c4
commit 800375c3c4
parent 595b9850e0
1 changed files with 88 additions and 0 deletions
--- a/src/gallium/drivers/i915/i915_fpc_nir.c
+++ b/src/gallium/drivers/i915/i915_fpc_nir.c
@ -292,6 +292,10 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
 {
   struct i915_fp_compile *p = c->p;
   nir_def *def = &alu->def;
+
+   if (def->index < c->ureg_map_size && c->ureg_map[def->index] != 0)
+      return;
+
   uint32_t mask = def_mask(def);
   uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p));
   set_ureg(c, def, dest);
@ -594,6 +598,90 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu)
            emitted[i + j] = true;
      }

+      /* ALU consumer fusion: if this vec feeds a single binary ALU op
+       * and the other ALU source is a single register, emit the ALU op
+       * per-group with partial writemasks instead of MOV+ALU.
+       */
+      if (list_is_singular(&def->uses)) {
+         nir_src *use = list_first_entry(&def->uses, nir_src, use_link);
+         nir_instr *use_instr = nir_src_use_instr(use);
+         if (use_instr->type == nir_instr_type_alu) {
+            nir_alu_instr *consumer = nir_instr_as_alu(use_instr);
+            unsigned nargs = nir_op_infos[consumer->op].num_inputs;
+            int vec_arg = -1;
+            for (unsigned a = 0; a < nargs; a++) {
+               if (consumer->src[a].src.ssa == def) {
+                  vec_arg = a;
+                  break;
+               }
+            }
+            uint32_t hw_op = 0;
+            bool can_fuse = (vec_arg >= 0 && nargs == 2);
+            if (can_fuse) {
+               switch (consumer->op) {
+               case nir_op_fmul: hw_op = A0_MUL; break;
+               case nir_op_fadd: hw_op = A0_ADD; break;
+               case nir_op_fmin: case nir_op_imin: case nir_op_umin:
+                  hw_op = A0_MIN; break;
+               case nir_op_fmax: case nir_op_imax: case nir_op_umax:
+                  hw_op = A0_MAX; break;
+               default: can_fuse = false; break;
+               }
+            }
+            /* check the non-vec source is a single register */
+            if (can_fuse) {
+               int other_arg = 1 - vec_arg;
+               nir_def *other_def = consumer->src[other_arg].src.ssa;
+               if (other_def->index < c->ureg_map_size &&
+                   c->ureg_map[other_def->index] != UREG_BAD) {
+                  uint32_t other = alu_src_ureg(c, &consumer->src[other_arg]);
+                  nir_def *cdef = &consumer->def;
+                  uint32_t cdest = dest;
+                  uint32_t cmask = def_mask(cdef);
+
+                  for (unsigned i = 0; i < n; i++) {
+                     if (emitted[i])
+                        continue;
+                     uint32_t base = UREG(GET_UREG_TYPE(srcs[i]),
+                                          GET_UREG_NR(srcs[i]));
+                     uint32_t group_mask = chan_mask[i];
+                     uint32_t ch[4] = { X, Y, Z, W };
+                     int ng[4] = { 0, 0, 0, 0 };
+                     ch[i] = ch_sel[i];
+                     ng[i] = neg_sel[i];
+                     for (unsigned j = i + 1; j < n; j++) {
+                        if (!emitted[j] &&
+                            (ch_sel[j] >= SRC_ZERO ||
+                             (srcs[j] & UREG_TYPE_NR_MASK) ==
+                             (srcs[i] & UREG_TYPE_NR_MASK))) {
+                           group_mask |= chan_mask[j];
+                           ch[j] = ch_sel[j];
+                           ng[j] = neg_sel[j];
+                           emitted[j] = true;
+                        }
+                     }
+                     uint32_t fused_src = negate(
+                        swizzle(base, ch[0], ch[1], ch[2], ch[3]),
+                        ng[0], ng[1], ng[2], ng[3]);
+                     if (vec_arg == 0)
+                        i915_emit_arith(p, hw_op, cdest,
+                                        group_mask & cmask, 0,
+                                        fused_src, other, 0);
+                     else
+                        i915_emit_arith(p, hw_op, cdest,
+                                        group_mask & cmask, 0,
+                                        other, fused_src, 0);
+                     emitted[i] = true;
+                  }
+
+                  set_ureg(c, cdef, cdest);
+                  c->def_csr[cdef->index] = p->csr - 3;
+                  break;
+               }
+            }
+         }
+      }
+
      /* Process real-register sources first, folding in any ZERO/ONE
       * const-swizzle sources that can piggyback on the same MOV.
       * Use the unswizzled base register since swizzle() composes.