mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 05:10:11 +01:00
Merge branch 'neon-rgba-to-bgra' into 'main'
mesa/format_utils: Add NEON-optimized RGBA to BGRA conversion See merge request mesa/mesa!38708
This commit is contained in:
commit
2d3720fb32
1 changed files with 148 additions and 1 deletions
|
|
@ -29,6 +29,7 @@
|
|||
#include "glformats.h"
|
||||
#include "format_pack.h"
|
||||
#include "format_unpack.h"
|
||||
#include "util/detect_arch.h"
|
||||
|
||||
const mesa_array_format RGBA32_FLOAT =
|
||||
MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS,
|
||||
|
|
@ -195,6 +196,152 @@ _mesa_compute_rgba2base2rgba_component_mapping(GLenum baseFormat, uint8_t *map)
|
|||
* Special case conversion function to swap r/b channels from the source
|
||||
* image to the dest image.
|
||||
*/
|
||||
#if DETECT_ARCH_AARCH64 && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__)
|
||||
#include <arm_neon.h>
|
||||
|
||||
static void
|
||||
convert_ubyte_rgba_to_bgra(size_t width, size_t height,
|
||||
const uint8_t *src, size_t src_stride,
|
||||
uint8_t *dst, size_t dst_stride)
|
||||
{
|
||||
/* shuffle table for RGBA -> BGRA */
|
||||
static const uint8_t tbl_data[16] = {
|
||||
2, 1, 0, 3, /* B←R, G←G, R←B, A←A */
|
||||
6, 5, 4, 7,
|
||||
10, 9, 8, 11,
|
||||
14, 13, 12, 15
|
||||
};
|
||||
uint8x16_t shuffle_tbl = vld1q_u8(tbl_data);
|
||||
|
||||
const size_t vec_width_16 = width & ~15u; /* 16 pixels per iteration */
|
||||
const size_t vec_width_4 = width & ~3u; /* 4 pixels per iteration */
|
||||
|
||||
for (size_t row = 0; row < height; row++) {
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
size_t x;
|
||||
|
||||
/* process 16 pixels (64 bytes) at a time */
|
||||
for (x = 0; x < vec_width_16; x += 16) {
|
||||
uint8x16_t rgba0 = vld1q_u8(s);
|
||||
uint8x16_t rgba1 = vld1q_u8(s + 16);
|
||||
uint8x16_t rgba2 = vld1q_u8(s + 32);
|
||||
uint8x16_t rgba3 = vld1q_u8(s + 48);
|
||||
|
||||
uint8x16_t bgra0 = vqtbl1q_u8(rgba0, shuffle_tbl);
|
||||
uint8x16_t bgra1 = vqtbl1q_u8(rgba1, shuffle_tbl);
|
||||
uint8x16_t bgra2 = vqtbl1q_u8(rgba2, shuffle_tbl);
|
||||
uint8x16_t bgra3 = vqtbl1q_u8(rgba3, shuffle_tbl);
|
||||
|
||||
vst1q_u8(d, bgra0);
|
||||
vst1q_u8(d + 16, bgra1);
|
||||
vst1q_u8(d + 32, bgra2);
|
||||
vst1q_u8(d + 48, bgra3);
|
||||
|
||||
s += 64;
|
||||
d += 64;
|
||||
}
|
||||
|
||||
/* process remaining 4-pixel chunks */
|
||||
for (; x < vec_width_4; x += 4) {
|
||||
uint8x16_t rgba = vld1q_u8(s);
|
||||
uint8x16_t bgra = vqtbl1q_u8(rgba, shuffle_tbl);
|
||||
vst1q_u8(d, bgra);
|
||||
s += 16;
|
||||
d += 16;
|
||||
}
|
||||
|
||||
/* scalar tail for 1-3 remaining pixels */
|
||||
for (; x < width; x++) {
|
||||
uint32_t pixel = ((const uint32_t *)s)[0];
|
||||
((uint32_t *)d)[0] = (pixel & 0xff00ff00) |
|
||||
((pixel & 0xff) << 16) |
|
||||
((pixel & 0xff0000) >> 16);
|
||||
s += 4;
|
||||
d += 4;
|
||||
}
|
||||
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
#elif DETECT_ARCH_ARM && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__)
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* armhf builds default to vfp, not neon, and refuses to compile neon intrinsics
|
||||
* unless you tell it "no really".
|
||||
*/
|
||||
#if defined(__clang__)
|
||||
__attribute__((target("neon")))
|
||||
#else
|
||||
__attribute__((target("fpu=neon")))
|
||||
#endif
|
||||
static void
|
||||
convert_ubyte_rgba_to_bgra(size_t width, size_t height,
|
||||
const uint8_t *src, size_t src_stride,
|
||||
uint8_t *dst, size_t dst_stride)
|
||||
{
|
||||
/* shuffle table for RGBA -> BGRA */
|
||||
static const uint8_t tbl_data[8] = {
|
||||
2, 1, 0, 3, /* pixel 0: B←R, G←G, R←B, A←A */
|
||||
6, 5, 4, 7
|
||||
};
|
||||
uint8x8_t shuffle_tbl = vld1_u8(tbl_data);
|
||||
|
||||
const size_t vec_width_8 = width & ~7u; /* 8 pixels per iteration */
|
||||
const size_t vec_width_2 = width & ~1u; /* 2 pixels per iteration */
|
||||
|
||||
for (size_t row = 0; row < height; row++) {
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
size_t x;
|
||||
|
||||
/* process 8 pixels (32 bytes) at a time */
|
||||
for (x = 0; x < vec_width_8; x += 8) {
|
||||
uint8x8_t rgba0 = vld1_u8(s);
|
||||
uint8x8_t rgba1 = vld1_u8(s + 8);
|
||||
uint8x8_t rgba2 = vld1_u8(s + 16);
|
||||
uint8x8_t rgba3 = vld1_u8(s + 24);
|
||||
|
||||
uint8x8_t bgra0 = vtbl1_u8(rgba0, shuffle_tbl);
|
||||
uint8x8_t bgra1 = vtbl1_u8(rgba1, shuffle_tbl);
|
||||
uint8x8_t bgra2 = vtbl1_u8(rgba2, shuffle_tbl);
|
||||
uint8x8_t bgra3 = vtbl1_u8(rgba3, shuffle_tbl);
|
||||
|
||||
vst1_u8(d, bgra0);
|
||||
vst1_u8(d + 8, bgra1);
|
||||
vst1_u8(d + 16, bgra2);
|
||||
vst1_u8(d + 24, bgra3);
|
||||
|
||||
s += 32;
|
||||
d += 32;
|
||||
}
|
||||
|
||||
/* process remaining 2-pixel chunks */
|
||||
for (; x < vec_width_2; x += 2) {
|
||||
uint8x8_t rgba = vld1_u8(s);
|
||||
uint8x8_t bgra = vtbl1_u8(rgba, shuffle_tbl);
|
||||
vst1_u8(d, bgra);
|
||||
s += 8;
|
||||
d += 8;
|
||||
}
|
||||
|
||||
/* scalar tail for remaining pixel */
|
||||
for (; x < width; x++) {
|
||||
uint32_t pixel = ((const uint32_t *)s)[0];
|
||||
((uint32_t *)d)[0] = (pixel & 0xff00ff00) |
|
||||
((pixel & 0xff) << 16) |
|
||||
((pixel & 0xff0000) >> 16);
|
||||
s += 4;
|
||||
d += 4;
|
||||
}
|
||||
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
static void
|
||||
convert_ubyte_rgba_to_bgra(size_t width, size_t height,
|
||||
const uint8_t *src, size_t src_stride,
|
||||
|
|
@ -245,7 +392,7 @@ convert_ubyte_rgba_to_bgra(size_t width, size_t height,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* This can be used to convert between most color formats.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue