From 6e38f519e0a68ad60e55235f314dab0f84c305f5 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Thu, 7 May 2026 12:40:35 -0400 Subject: [PATCH] i915/corm: add late scalarization as variant dimension Some shaders produce better code when fully scalarized after optimization: vec3(a, b, 1.0) feeding a dot product creates a cross-register vec construction, but scalarizing the fmul exposes 1.0*1.0 to constant folding, eliminating the vec entirely. Other shaders are worse fully scalar because corm's vec construction handles same_reg vecs at zero cost. Add late_scalar as a variant dimension so the multi-variant framework picks whichever is better per shader. shader-db (I915_FS=nir): 254/403 compiled, 4063 alu shader-db (I915_FS=both): nir won 254 (26 identical, 1 tied, 221 better, 6 only), 36 TGSI, 113 neither Assisted-by: Claude --- src/gallium/drivers/i915/i915_fpc.h | 1 + src/gallium/drivers/i915/i915_state.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h index fe0d0f1e544..9e3e4b8ee63 100644 --- a/src/gallium/drivers/i915/i915_fpc.h +++ b/src/gallium/drivers/i915/i915_fpc.h @@ -185,6 +185,7 @@ extern void i915_translate_fragment_program(struct i915_context *i915, struct corm_compile_opts { bool deferred_const; bool seq_sne_opt; + bool late_scalar; }; extern void i915_translate_fragment_program_nir(struct i915_context *i915, diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c index 10a185db957..88c65eee1c3 100644 --- a/src/gallium/drivers/i915/i915_state.c +++ b/src/gallium/drivers/i915/i915_state.c @@ -741,6 +741,10 @@ i915_create_fs_state(struct pipe_context *pipe, { .deferred_const = false, .seq_sne_opt = true }, { .deferred_const = true, .seq_sne_opt = false }, { .deferred_const = true, .seq_sne_opt = true }, + { .deferred_const = false, .seq_sne_opt = false, .late_scalar = true }, + { .deferred_const = false, .seq_sne_opt = true, .late_scalar = true }, + { .deferred_const = true, .seq_sne_opt = false, .late_scalar = true }, + { .deferred_const = true, .seq_sne_opt = true, .late_scalar = true }, }; struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)]; @@ -764,14 +768,19 @@ i915_create_fs_state(struct pipe_context *pipe, nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s)); for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { - nir_shader *variant_nir = (v == ARRAY_SIZE(corm_variants) - 1) - ? nir_s : nir_shader_clone(NULL, nir_s); + nir_shader *variant_nir = nir_shader_clone(NULL, nir_s); + if (corm_variants[v].late_scalar) { + NIR_PASS(_, variant_nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(_, variant_nir, nir_opt_copy_prop); + NIR_PASS(_, variant_nir, nir_opt_algebraic); + NIR_PASS(_, variant_nir, nir_opt_dce); + nir_index_ssa_defs(nir_shader_get_entrypoint(variant_nir)); + } memset(&nir_results[v], 0, sizeof(nir_results[v])); i915_populate_fs_metadata(&nir_results[v], variant_nir); i915_translate_fragment_program_nir(i915, &nir_results[v], variant_nir, &corm_variants[v]); - if (v < ARRAY_SIZE(corm_variants) - 1) - ralloc_free(variant_nir); + ralloc_free(variant_nir); bool ok = !nir_results[v].error || !nir_results[v].error[0]; if (ok && (best_nir < 0 || @@ -779,8 +788,7 @@ i915_create_fs_state(struct pipe_context *pipe, best_nir = v; } - if (try_tgsi) - ralloc_free(nir_s); + ralloc_free(nir_s); } if (try_tgsi) {