From 15d5675e8e8fe450315f749f3300e53e93d90075 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand <faith.ekstrand@collabora.com>
Date: Mon, 30 Mar 2026 15:42:54 -0400
Subject: [PATCH] pan/bi: Pack 8-bit vec2s

We used to splat out 8-bit vec2s to 16-bit by repeating both 8-bit
halves twice with the B0011 swizzle.  I think the original idea here was
that 16-bit swizzles were more widely available in the hardware and that
this would make swizzling things easier.  The problem is that nothing
actually knows that the value is half-repeated like this so nothing
knows it can upgrade a swizzle from B0022 to B0123 (H01).  So instead we
get a bunch of B0022 swizzles, which nothing supports.

We can shave a lot of instructions if we just stop trying to be so
clever and instead repeat the whole thing with a B0101 swizzle.

The only real issue here is that v2[su]8_to_v2[fsu]16 needs a B0011
swizzle, which we have to apply on the fly.  Fortunately, any swizzle
can be composed with B0011.

Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40720>
---
 .../compiler/bifrost/bifrost_compile.c        | 87 +++++++++++--------
 1 file changed, 50 insertions(+), 37 deletions(-)

diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 3f09e565361..03d285a57ce 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -2307,35 +2307,49 @@ bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
    } else if (bitsize == 8 && comps == 1) {
       idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
    } else if (bitsize == 8) {
-      if (comps == 2 || comps == 4) {
-         /* For a vec2, place the two components in 0 and 2 instead of
-          * 0 and 1.  For a scalar, splat it out to all channels.
-          */
-         unsigned c[4] = {0};
-         for (unsigned i = 0; i < 4; ++i)
-            c[i] = src.swizzle[i * comps / 4] & 3;
+      bool has_swizzle = false;
+      enum bi_swizzle swizzle = BI_SWIZZLE_H01;
+      if (comps == 3) {
+         unsigned c[4];
+         for (unsigned i = 0; i < 3; ++i)
+            c[i] = src.swizzle[i] & 3;
 
-         enum bi_swizzle swizzle;
-         if (bi_swizzle_from_byte_channels(c, &swizzle)) {
-            idx.swizzle = swizzle;
-            return idx;
+         /* Try to find a swizzle that starts with the given v3i8 swizzle */
+         for (unsigned i = 0; i < 4; i++) {
+            c[3] = i;
+            if (bi_swizzle_from_byte_channels(c, &swizzle)) {
+               has_swizzle = true;
+               break;
+            }
          }
+      } else {
+         /* For 1 and 2-component, repeat the swizzle to increase the chances
+          * that it's a valid bi_swizzle.
+          */
+         unsigned c[4];
+         for (unsigned i = 0; i < 4; ++i)
+            c[i] = src.swizzle[i % comps] & 3;
+         has_swizzle = bi_swizzle_from_byte_channels(c, &swizzle);
       }
 
-      /* XXX: Use optimized swizzle when posisble */
-      bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
-      unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
-
-      for (unsigned i = 0; i < comps; ++i) {
-         unoffset_srcs[i] = bi_src_index(&src.src);
-         channels[i] = src.swizzle[i];
+      if (has_swizzle) {
+         idx.swizzle = swizzle;
+         return idx;
       }
 
-      bi_index temp = bi_temp(b->shader);
-      bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
+      bi_index v4_srcs[4];
+      for (unsigned i = 0; i < comps; i++) {
+         v4_srcs[i] = idx;
+         v4_srcs[i].swizzle = BI_SWIZZLE_B0 + src.swizzle[i];
+      }
+      for (unsigned i = comps; i < 4; i++)
+         v4_srcs[i] = bi_imm_u8(0);
+
+      bi_index temp = bi_mkvec_v4i8(b, v4_srcs[0], v4_srcs[1],
+                                    v4_srcs[2], v4_srcs[3]);
 
       static const enum bi_swizzle swizzle_lut[] = {
-         BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_B0123, BI_SWIZZLE_B0123
+         BI_SWIZZLE_B0000, BI_SWIZZLE_B0101, BI_SWIZZLE_B0123, BI_SWIZZLE_B0123
       };
       assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
 
@@ -2348,6 +2362,17 @@ bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
    return idx;
 }
 
+static bi_index
+bi_swiz_b01(bi_index idx)
+{
+   enum bi_swizzle swizzle;
+   bool valid = bi_try_compose_swizzles(&swizzle, BI_SWIZZLE_B01, idx.swizzle);
+   assert(valid);
+   /* Composition is asserted to succeed; only idx's swizzle is replaced. */
+   idx.swizzle = swizzle;
+   return idx;
+}
+
 static enum bi_round
 bi_nir_round(nir_op op)
 {
@@ -2865,12 +2890,6 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
    bi_index s2 =
       srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
 
-   bool need_post_swizzle = sz == 8 && comps == 2;
-   bi_index post_swizzle_dst = dst;
-   if (need_post_swizzle) {
-      dst = bi_temp(b->shader);
-   }
-
    switch (instr->op) {
    case nir_op_ffma:
       bi_fma_to(b, sz, dst, s0, s1, s2);
@@ -3148,7 +3167,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       if (src_sz == 16)
          bi_v2u16_to_v2f16_to(b, dst, s0);
       else if (src_sz == 8)
-         bi_v2u8_to_v2f16_to(b, dst, s0);
+         bi_v2u8_to_v2f16_to(b, dst, bi_swiz_b01(s0));
       break;
 
    case nir_op_u2f32:
@@ -3174,7 +3193,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       if (src_sz == 16)
          bi_v2s16_to_v2f16_to(b, dst, s0);
       else if (src_sz == 8)
-         bi_v2s8_to_v2f16_to(b, dst, s0);
+         bi_v2s8_to_v2f16_to(b, dst, bi_swiz_b01(s0));
       break;
 
    case nir_op_i2f32:
@@ -3216,7 +3235,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       assert(src_sz == 8 || src_sz == 32);
 
       if (src_sz == 8)
-         bi_v2s8_to_v2s16_to(b, dst, s0);
+         bi_v2s8_to_v2s16_to(b, dst, bi_swiz_b01(s0));
       else
          bi_mov_i32_to(b, dst, s0);
       break;
@@ -3225,7 +3244,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       assert(src_sz == 8 || src_sz == 32);
 
       if (src_sz == 8)
-         bi_v2u8_to_v2u16_to(b, dst, s0);
+         bi_v2u8_to_v2u16_to(b, dst, bi_swiz_b01(s0));
       else
          bi_mov_i32_to(b, dst, s0);
       break;
@@ -3440,12 +3459,6 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
       UNREACHABLE("Unknown ALU op");
    }
-
-   if (need_post_swizzle) {
-      bi_index srcs[2] = {dst, dst};
-      unsigned channels[2] = {0, 2};
-      bi_make_vec_to(b, post_swizzle_dst, srcs, channels, 2, 8);
-   }
 }
 
 /* Returns dimension with 0 special casing cubemaps. Shamelessly copied from