From 9c2b19219a0df1e2bd0f3886d7b690edfdf653f2 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand <faith.ekstrand@collabora.com>
Date: Mon, 9 Mar 2026 15:09:23 -0400
Subject: [PATCH] nir/lower_bool_to_bitsize: Make all bN_csel sources match

Previously, we assumed that the selector for bcsel could be whatever,
regardless of the bit sizes of the data and we'd just fix it in the
back-end.  This works okay for scalars but falls over the moment we
vectorize because all our vector handling assumes bit sizes match.
Since matching bit sizes is what the hardware wants anyway, it's better
to do the right thing in NIR and hope copy-propagation can fold in
conversions if needed.

Unfortunately, copy prop isn't that smart yet so this does hurt a bit:

    Instrs: 1193679 -> 1198086 (+0.37%); split: -0.06%, +0.43%
    CodeSize: 11915136 -> 11950592 (+0.30%); split: -0.05%, +0.34%
    Full: 160985 -> 160941 (-0.03%); split: -0.04%, +0.01%
    Estimated normalized CVT cycles: 4456.938557000181 -> 4480.876069000186 (+0.54%); split: -0.13%, +0.67%
    Estimated normalized SFU cycles: 6350.9375 -> 6392.21875 (+0.65%)
    Estimated normalized Load/Store cycles: 205773.0 -> 205795.0 (+0.01%)
    Maximum number of threads: 12864 -> 12863 (-0.01%)
    Number of spill instructions: 22487 -> 22489 (+0.01%)
    Number of fill instructions: 52179 -> 52219 (+0.08%)

Hurt shaders:

    google-meet-clvk/BgBlur
    google-meet-clvk/Relight
    parallel-rdp/small_subgroup
    parallel-rdp/small_uber_subgroup

The proper solution here is to teach copy-prop about this stuff so that
it can propagate swizzles into ALU ops when they're supported:
https://gitlab.freedesktop.org/panfrost/mesa/-/issues/265

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14945
Cc: mesa-stable
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
(cherry picked from commit 3fd471dca5d2dfc2415b568db8b1a7458d9c93f6)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40359>
---
 .pick_status.json                            |  2 +-
 src/compiler/nir/nir_lower_bool_to_bitsize.c | 51 ++++++++++++--------
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index aa094f109a7..9a948d6f4cb 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -494,7 +494,7 @@
         "description": "nir/lower_bool_to_bitsize: Make all bN_csel sources match",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/compiler/nir/nir_lower_bool_to_bitsize.c b/src/compiler/nir/nir_lower_bool_to_bitsize.c
index 80879e67ce1..6ed71226324 100644
--- a/src/compiler/nir/nir_lower_bool_to_bitsize.c
+++ b/src/compiler/nir/nir_lower_bool_to_bitsize.c
@@ -57,6 +57,32 @@ get_bool_convert_opcode(uint32_t dst_bit_size)
    }
 }
 
+static void
+resize_bool_alu_source(nir_builder *b, nir_alu_instr *alu,
+                       uint32_t src_idx, uint32_t bit_size)
+{
+   if (nir_src_bit_size(alu->src[src_idx].src) == bit_size)
+      return;
+
+   b->cursor = nir_before_instr(&alu->instr);
+   nir_op convert_op = get_bool_convert_opcode(bit_size);
+
+   /* Retain the number of components and swizzle of the original
+    * instruction so that we don’t unnecessarily create a vectorized
+    * instruction.
+    */
+   nir_def *new_src =
+      nir_build_alu1(b, convert_op, nir_ssa_for_alu_src(b, alu, src_idx));
+
+   nir_src_rewrite(&alu->src[src_idx].src, new_src);
+
+   /* The swizzle will have been handled by the conversion instruction
+    * so we can reset it back to the default
+    */
+   for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++)
+      alu->src[src_idx].swizzle[j] = j;
+}
+
 static void
 make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx)
 {
@@ -65,25 +91,8 @@ make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx)
     */
    const nir_op_info *op_info = &nir_op_infos[alu->op];
    uint32_t bit_size = nir_src_bit_size(alu->src[start_idx].src);
-   for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++) {
-      if (nir_src_bit_size(alu->src[i].src) != bit_size) {
-         b->cursor = nir_before_instr(&alu->instr);
-         nir_op convert_op = get_bool_convert_opcode(bit_size);
-         /* Retain the number of components and swizzle of the original
-          * instruction so that we don’t unnecessarily create a vectorized
-          * instruction.
-          */
-         nir_def *new_src =
-            nir_build_alu1(b, convert_op, nir_ssa_for_alu_src(b, alu, i));
-
-         nir_src_rewrite(&alu->src[i].src, new_src);
-         /* The swizzle will have been handled by the conversion instruction
-          * so we can reset it back to the default
-          */
-         for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++)
-            alu->src[i].swizzle[j] = j;
-      }
-   }
+   for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++)
+      resize_bool_alu_source(b, alu, i, bit_size);
 }
 
 static bool
@@ -130,7 +139,9 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *alu)
    case nir_op_bcsel:
       /* bcsel may be choosing between boolean sources too */
       if (alu->def.bit_size == 1)
-         make_sources_canonical(b, alu, 1);
+         make_sources_canonical(b, alu, 0);
+      else
+         resize_bool_alu_source(b, alu, 0, alu->def.bit_size);
       break;
 
    default: