From 28400d7c6c8da10192ae422cfd418ab759af23d8 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 May 2026 12:56:40 -0400 Subject: [PATCH] i915/corm: add fsat folding, output dest folding, and vec dest folding Add the def_csr mechanism: track the instruction cursor position for each single-instruction SSA def so we can retroactively patch it. fsat folding: when a single-use SSA def feeds into fsat, fold A0_DEST_SATURATE into the previous instruction instead of emitting a separate MOV. Output dest folding: when store_output consumes a single-use temp, patch the previous instruction to write directly to the output register (OC/OD). Includes vec look-through for the identity-swizzle case where a vec was collapsed to a register alias. Vec dest folding: single-use scalar ALU results feeding a vec component get patched to write directly into the vec dest register. shader-db (I915_FS=nir): 209/403 compiled, 3157 alu shader-db (I915_FS=both): nir won 209 (26 identical, 16 tied, 164 better, 3 only), 78 TGSI, 116 neither Assisted-by: Claude --- src/gallium/drivers/i915/i915_fpc_nir.c | 85 ++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c index e4834c94c4b..41482cbd9d1 100644 --- a/src/gallium/drivers/i915/i915_fpc_nir.c +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -21,6 +21,7 @@ struct nir_to_i915 { struct i915_fragment_shader *ifs; uint32_t *ureg_map; + uint32_t **def_csr; unsigned ureg_map_size; int *last_use; @@ -221,6 +222,8 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) if (nir_op_infos[alu->op].num_inputs >= 3) src2 = alu_src_ureg(c, &alu->src[2]); + uint32_t *pre_csr = p->csr; + switch (alu->op) { case nir_op_mov: case nir_op_fcanonicalize: @@ -237,9 +240,22 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) i915_emit_arith(p, A0_MAX, dest, mask, 0, src0, negate(src0, 1, 1, 1, 1), 0); break; - case nir_op_fsat: + case nir_op_fsat: { + nir_def *src_def = alu->src[0].src.ssa; + uint32_t *prev = c->def_csr[src_def->index]; + if (prev && list_is_singular(&src_def->uses)) { + prev[0] |= A0_DEST_SATURATE; + i915_release_temp(p, GET_UREG_NR(dest)); + set_ureg(c, def, src_ureg(c, &alu->src[0].src)); + c->def_csr[def->index] = prev; + unsigned src_idx = alu->src[0].src.ssa->index; + if (c->last_use[src_idx] == c->ip) + c->last_use[src_idx] = c->last_use[def->index]; + return; + } i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0); break; + } case nir_op_fadd: i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0); break; @@ -399,6 +415,29 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1; } + /* Single-component ALU dest folding: if a vec source is a single-use + * scalar ALU result in a temp, patch that instruction to write directly + * into our dest with the right channel mask. + */ + for (unsigned i = 0; i < n; i++) { + nir_def *src_def = alu->src[i].src.ssa; + uint32_t *prev_csr = c->def_csr[src_def->index]; + if (!prev_csr || !list_is_singular(&src_def->uses)) + continue; + if (GET_UREG_TYPE(srcs[i]) != REG_TYPE_R) + continue; + if (src_def->num_components != 1) + continue; + + prev_csr[0] = (prev_csr[0] & ~(A0_DEST_CHANNEL_ALL | + (0x1ff << A0_DEST_NR_SHIFT))) | + A0_DEST(dest) | chan_mask[i]; + + i915_release_temp(p, GET_UREG_NR(srcs[i])); + c->ureg_map[src_def->index] = dest; + emitted[i] = true; + } + /* Process real-register sources first, folding in any ZERO/ONE * const-swizzle sources that can piggyback on the same MOV. * Use the unswizzled base register since swizzle() composes. @@ -471,6 +510,9 @@ emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) break; } + if (p->csr == pre_csr + 3) + c->def_csr[def->index] = pre_csr; + i915_release_utemps(p); } @@ -640,6 +682,45 @@ emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr) dest = UREG(REG_TYPE_OC, 0); } + nir_def *src_def = intr->src[0].ssa; + uint32_t *prev = c->def_csr[src_def->index]; + + /* Look through identity vec (same_reg case emits no instructions). + * Check that all uses of the underlying def come from this vec. + */ + bool looked_through_vec = false; + if (!prev) { + nir_instr *def_instr = nir_def_instr_nonconst(src_def); + if (def_instr->type == nir_instr_type_alu) { + nir_alu_instr *vec = nir_instr_as_alu(def_instr); + if ((vec->op == nir_op_vec4 || vec->op == nir_op_vec3 || + vec->op == nir_op_vec2) && + list_is_singular(&src_def->uses)) { + nir_def *inner = vec->src[0].src.ssa; + bool all_from_vec = true; + nir_foreach_use(use, inner) { + if (nir_src_use_instr(use) != def_instr) { + all_from_vec = false; + break; + } + } + if (all_from_vec) { + src_def = inner; + prev = c->def_csr[src_def->index]; + looked_through_vec = true; + } + } + } + } + + if (prev && comp == 0 && + (looked_through_vec || list_is_singular(&src_def->uses))) { + prev[0] = (prev[0] & ~(A0_DEST_CHANNEL_ALL | + (0x1ff << A0_DEST_NR_SHIFT))) | + A0_DEST(dest) | writemask_to_mask(wm); + break; + } + if (comp > 0) { uint32_t s[4] = { X, Y, Z, W }; for (int i = 3; i >= (int)comp; i--) @@ -855,6 +936,7 @@ i915_translate_fragment_program_nir(struct i915_context *i915, .opts = *opts, .ureg_map_size = impl->ssa_alloc, .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)), + .def_csr = CALLOC(impl->ssa_alloc, sizeof(uint32_t *)), .last_use = CALLOC(impl->ssa_alloc, sizeof(int)), }; @@ -935,6 +1017,7 @@ cleanup: ralloc_free(p->error); FREE(c.last_use); + FREE(c.def_csr); FREE(c.ureg_map); FREE(p);