From 9c2b19219a0df1e2bd0f3886d7b690edfdf653f2 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Mon, 9 Mar 2026 15:09:23 -0400 Subject: [PATCH] nir/lower_bool_to_bitsize: Make all bN_csel sources match Previously, we assumed that the selector for bcsel could be whatever, regardless of the bit sizes of the data and we'd just fix it in the back-end. This works okay for scalars but falls over the moment we vectorize because all our vector handling assumes bit sizes match. Since matching bit sizes is what the hardware wants anyway, it's better to do the right thing in NIR and hope copy-propagation can fold in conversions if needed. Unfortunately, copy prop isn't that smart yet so this does hurt a bit: Instrs: 1193679 -> 1198086 (+0.37%); split: -0.06%, +0.43% CodeSize: 11915136 -> 11950592 (+0.30%); split: -0.05%, +0.34% Full: 160985 -> 160941 (-0.03%); split: -0.04%, +0.01% Estimated normalized CVT cycles: 4456.938557000181 -> 4480.876069000186 (+0.54%); split: -0.13%, +0.67% Estimated normalized SFU cycles: 6350.9375 -> 6392.21875 (+0.65%) Estimated normalized Load/Store cycles: 205773.0 -> 205795.0 (+0.01%) Maximum number of threads: 12864 -> 12863 (-0.01%) Number of spill instructions: 22487 -> 22489 (+0.01%) Number of fill instructions: 52179 -> 52219 (+0.08%) Hurt shaders: google-meet-clvk/BgBlur google-meet-clvk/Relight parallel-rdp/small_subgroup parallel-rdp/small_uber_subgroup The proper solution here is to teach copy-prop about this stuff so that it can propagate swizzles into ALU ops when they're supported: https://gitlab.freedesktop.org/panfrost/mesa/-/issues/265 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14945 Cc: mesa-stable Acked-by: Alyssa Rosenzweig Reviewed-by: Christoph Pillmayer (cherry picked from commit 3fd471dca5d2dfc2415b568db8b1a7458d9c93f6) Part-of: --- .pick_status.json | 2 +- src/compiler/nir/nir_lower_bool_to_bitsize.c | 51 ++++++++++++-------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index aa094f109a7..9a948d6f4cb 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -494,7 +494,7 @@ "description": "nir/lower_bool_to_bitsize: Make all bN_csel sources match", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/compiler/nir/nir_lower_bool_to_bitsize.c b/src/compiler/nir/nir_lower_bool_to_bitsize.c index 80879e67ce1..6ed71226324 100644 --- a/src/compiler/nir/nir_lower_bool_to_bitsize.c +++ b/src/compiler/nir/nir_lower_bool_to_bitsize.c @@ -57,6 +57,32 @@ get_bool_convert_opcode(uint32_t dst_bit_size) } } +static void +resize_bool_alu_source(nir_builder *b, nir_alu_instr *alu, + uint32_t src_idx, uint32_t bit_size) +{ + if (nir_src_bit_size(alu->src[src_idx].src) == bit_size) + return; + + b->cursor = nir_before_instr(&alu->instr); + nir_op convert_op = get_bool_convert_opcode(bit_size); + + /* Retain the number of components and swizzle of the original + * instruction so that we don’t unnecessarily create a vectorized + * instruction. + */ + nir_def *new_src = + nir_build_alu1(b, convert_op, nir_ssa_for_alu_src(b, alu, src_idx)); + + nir_src_rewrite(&alu->src[src_idx].src, new_src); + + /* The swizzle will have been handled by the conversion instruction + * so we can reset it back to the default + */ + for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++) + alu->src[src_idx].swizzle[j] = j; +} + static void make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx) { @@ -65,25 +91,8 @@ make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx) */ const nir_op_info *op_info = &nir_op_infos[alu->op]; uint32_t bit_size = nir_src_bit_size(alu->src[start_idx].src); - for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++) { - if (nir_src_bit_size(alu->src[i].src) != bit_size) { - b->cursor = nir_before_instr(&alu->instr); - nir_op convert_op = get_bool_convert_opcode(bit_size); - /* Retain the number of components and swizzle of the original - * instruction so that we don’t unnecessarily create a vectorized - * instruction. - */ - nir_def *new_src = - nir_build_alu1(b, convert_op, nir_ssa_for_alu_src(b, alu, i)); - - nir_src_rewrite(&alu->src[i].src, new_src); - /* The swizzle will have been handled by the conversion instruction - * so we can reset it back to the default - */ - for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++) - alu->src[i].swizzle[j] = j; - } - } + for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++) + resize_bool_alu_source(b, alu, i, bit_size); } static bool @@ -130,7 +139,9 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *alu) case nir_op_bcsel: /* bcsel may be choosing between boolean sources too */ if (alu->def.bit_size == 1) - make_sources_canonical(b, alu, 1); + make_sources_canonical(b, alu, 0); + else + resize_bool_alu_source(b, alu, 0, alu->def.bit_size); break; default: