From f48aff4943b17c175efbb293f4dadd2d62eab7f9 Mon Sep 17 00:00:00 2001
From: Christian Gmeiner <cgmeiner@igalia.com>
Date: Fri, 21 Nov 2025 23:50:53 +0100
Subject: [PATCH] mesa/format_utils: Add NEON-optimized RGBA to BGRA conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ARM NEON implementation of convert_ubyte_rgba_to_bgra() using
architecture-specific instructions for efficient channel shuffling.
ARM64 uses vqtbl1q_u8 for table-based shuffling, while ARM32 uses
vrev32q_u8 + vextq_u8 for byte reversal. The implementation processes
multiple pixels per iteration with proper handling of non-aligned
widths via a scalar tail path.

Performance improvements across three ARM platforms:

ARM32 Cortex-A9:
  - 64x64:      3.1x faster (0.85 GB/s → 2.63 GB/s)
  - 1920x1080:  1.06x faster (0.83 GB/s → 0.88 GB/s)
  - 3840x2160:  1.07x faster (0.85 GB/s → 0.90 GB/s)

ARM64 Cortex-A53:
  - 64x64:      3.8x faster (2.30 GB/s → 8.78 GB/s)
  - 1920x1080:  1.4x faster (2.85 GB/s → 3.98 GB/s)
  - 3840x2160:  1.4x faster (2.89 GB/s → 4.12 GB/s)

Apple M2:
  - 64x64:      3.3x faster (21.1 GB/s → 69.7 GB/s)
  - 1920x1080:  1.7x faster (58.2 GB/s → 96.7 GB/s)
  - 3840x2160:  1.3x faster (53.5 GB/s → 68.8 GB/s)

The optimization provides consistent speedups across image sizes and ARM
variants, with larger gains on smaller images due to better cache
utilization. On large images, performance becomes memory bandwidth-limited
across all platforms.

The NEON path is enabled for both AARCH64 and ARM when NEON is available
(!__SOFTFP__), falling back to the existing scalar implementation on
other architectures or when explicitly disabled via NO_FORMAT_ASM.

Signed-off-by: Christian Gmeiner <cgmeiner@igalia.com>
---
 src/mesa/main/format_utils.c | 149 ++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/format_utils.c b/src/mesa/main/format_utils.c
index ac6f8a624be..76af42f389a 100644
--- a/src/mesa/main/format_utils.c
+++ b/src/mesa/main/format_utils.c
@@ -29,6 +29,7 @@
 #include "glformats.h"
 #include "format_pack.h"
 #include "format_unpack.h"
+#include "util/detect_arch.h"
 
 const mesa_array_format RGBA32_FLOAT =
    MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS,
@@ -195,6 +196,152 @@ _mesa_compute_rgba2base2rgba_component_mapping(GLenum baseFormat, uint8_t *map)
  * Special case conversion function to swap r/b channels from the source
  * image to the dest image.
  */
+#if DETECT_ARCH_AARCH64 && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__)
+#include <arm_neon.h>
+
+static void
+convert_ubyte_rgba_to_bgra(size_t width, size_t height,
+                           const uint8_t *src, size_t src_stride,
+                           uint8_t *dst, size_t dst_stride)
+{
+   /* shuffle table for RGBA -> BGRA */
+   static const uint8_t tbl_data[16] = {
+      2, 1, 0, 3,    /* B←R, G←G, R←B, A←A */
+      6, 5, 4, 7,
+      10, 9, 8, 11,
+      14, 13, 12, 15
+   };
+   uint8x16_t shuffle_tbl = vld1q_u8(tbl_data);
+
+   const size_t vec_width_16 = width & ~15u;  /* 16 pixels per iteration */
+   const size_t vec_width_4 = width & ~3u;    /* 4 pixels per iteration */
+
+   for (size_t row = 0; row < height; row++) {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      size_t x;
+
+      /* process 16 pixels (64 bytes) at a time */
+      for (x = 0; x < vec_width_16; x += 16) {
+         uint8x16_t rgba0 = vld1q_u8(s);
+         uint8x16_t rgba1 = vld1q_u8(s + 16);
+         uint8x16_t rgba2 = vld1q_u8(s + 32);
+         uint8x16_t rgba3 = vld1q_u8(s + 48);
+
+         uint8x16_t bgra0 = vqtbl1q_u8(rgba0, shuffle_tbl);
+         uint8x16_t bgra1 = vqtbl1q_u8(rgba1, shuffle_tbl);
+         uint8x16_t bgra2 = vqtbl1q_u8(rgba2, shuffle_tbl);
+         uint8x16_t bgra3 = vqtbl1q_u8(rgba3, shuffle_tbl);
+
+         vst1q_u8(d, bgra0);
+         vst1q_u8(d + 16, bgra1);
+         vst1q_u8(d + 32, bgra2);
+         vst1q_u8(d + 48, bgra3);
+
+         s += 64;
+         d += 64;
+      }
+
+      /* process remaining 4-pixel chunks */
+      for (; x < vec_width_4; x += 4) {
+         uint8x16_t rgba = vld1q_u8(s);
+         uint8x16_t bgra = vqtbl1q_u8(rgba, shuffle_tbl);
+         vst1q_u8(d, bgra);
+         s += 16;
+         d += 16;
+      }
+
+      /* scalar tail for 1-3 remaining pixels */
+      for (; x < width; x++) {
+         uint32_t pixel = ((const uint32_t *)s)[0];
+         ((uint32_t *)d)[0] = (pixel & 0xff00ff00) |
+                              ((pixel & 0xff) << 16) |
+                              ((pixel & 0xff0000) >> 16);
+         s += 4;
+         d += 4;
+      }
+
+      src += src_stride;
+      dst += dst_stride;
+   }
+}
+#elif DETECT_ARCH_ARM && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__)
+#include <arm_neon.h>
+
+/* armhf builds default to vfp, not neon, and refuses to compile neon intrinsics
+ * unless you tell it "no really".
+ */
+#if defined(__clang__)
+__attribute__((target("neon")))
+#else
+__attribute__((target("fpu=neon")))
+#endif
+static void
+convert_ubyte_rgba_to_bgra(size_t width, size_t height,
+                           const uint8_t *src, size_t src_stride,
+                           uint8_t *dst, size_t dst_stride)
+{
+   /* shuffle table for RGBA -> BGRA */
+   static const uint8_t tbl_data[8] = {
+      2, 1, 0, 3,    /* pixel 0: B←R, G←G, R←B, A←A */
+      6, 5, 4, 7
+   };
+   uint8x8_t shuffle_tbl = vld1_u8(tbl_data);
+
+   const size_t vec_width_8 = width & ~7u;   /* 8 pixels per iteration */
+   const size_t vec_width_2 = width & ~1u;   /* 2 pixels per iteration */
+
+   for (size_t row = 0; row < height; row++) {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      size_t x;
+
+      /* process 8 pixels (32 bytes) at a time */
+      for (x = 0; x < vec_width_8; x += 8) {
+         uint8x8_t rgba0 = vld1_u8(s);
+         uint8x8_t rgba1 = vld1_u8(s + 8);
+         uint8x8_t rgba2 = vld1_u8(s + 16);
+         uint8x8_t rgba3 = vld1_u8(s + 24);
+
+         uint8x8_t bgra0 = vtbl1_u8(rgba0, shuffle_tbl);
+         uint8x8_t bgra1 = vtbl1_u8(rgba1, shuffle_tbl);
+         uint8x8_t bgra2 = vtbl1_u8(rgba2, shuffle_tbl);
+         uint8x8_t bgra3 = vtbl1_u8(rgba3, shuffle_tbl);
+
+         vst1_u8(d, bgra0);
+         vst1_u8(d + 8, bgra1);
+         vst1_u8(d + 16, bgra2);
+         vst1_u8(d + 24, bgra3);
+
+         s += 32;
+         d += 32;
+      }
+
+      /* process remaining 2-pixel chunks */
+      for (; x < vec_width_2; x += 2) {
+         uint8x8_t rgba = vld1_u8(s);
+         uint8x8_t bgra = vtbl1_u8(rgba, shuffle_tbl);
+         vst1_u8(d, bgra);
+         s += 8;
+         d += 8;
+      }
+
+      /* scalar tail for remaining pixel */
+      for (; x < width; x++) {
+         uint32_t pixel = ((const uint32_t *)s)[0];
+         ((uint32_t *)d)[0] = (pixel & 0xff00ff00) |
+                              ((pixel & 0xff) << 16) |
+                              ((pixel & 0xff0000) >> 16);
+         s += 4;
+         d += 4;
+      }
+
+      src += src_stride;
+      dst += dst_stride;
+   }
+}
+
+#else
 static void
 convert_ubyte_rgba_to_bgra(size_t width, size_t height,
                            const uint8_t *src, size_t src_stride,
@@ -245,7 +392,7 @@ convert_ubyte_rgba_to_bgra(size_t width, size_t height,
       }
    }
 }
-
+#endif
 
 /**
  * This can be used to convert between most color formats.