diff --git a/src/mesa/main/format_utils.c b/src/mesa/main/format_utils.c index ac6f8a624be..76af42f389a 100644 --- a/src/mesa/main/format_utils.c +++ b/src/mesa/main/format_utils.c @@ -29,6 +29,7 @@ #include "glformats.h" #include "format_pack.h" #include "format_unpack.h" +#include "util/detect_arch.h" const mesa_array_format RGBA32_FLOAT = MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS, @@ -195,6 +196,152 @@ _mesa_compute_rgba2base2rgba_component_mapping(GLenum baseFormat, uint8_t *map) * Special case conversion function to swap r/b channels from the source * image to the dest image. */ +#if DETECT_ARCH_AARCH64 && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__) +#include + +static void +convert_ubyte_rgba_to_bgra(size_t width, size_t height, + const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride) +{ + /* shuffle table for RGBA -> BGRA */ + static const uint8_t tbl_data[16] = { + 2, 1, 0, 3, /* B←R, G←G, R←B, A←A */ + 6, 5, 4, 7, + 10, 9, 8, 11, + 14, 13, 12, 15 + }; + uint8x16_t shuffle_tbl = vld1q_u8(tbl_data); + + const size_t vec_width_16 = width & ~15u; /* 16 pixels per iteration */ + const size_t vec_width_4 = width & ~3u; /* 4 pixels per iteration */ + + for (size_t row = 0; row < height; row++) { + const uint8_t *s = src; + uint8_t *d = dst; + size_t x; + + /* process 16 pixels (64 bytes) at a time */ + for (x = 0; x < vec_width_16; x += 16) { + uint8x16_t rgba0 = vld1q_u8(s); + uint8x16_t rgba1 = vld1q_u8(s + 16); + uint8x16_t rgba2 = vld1q_u8(s + 32); + uint8x16_t rgba3 = vld1q_u8(s + 48); + + uint8x16_t bgra0 = vqtbl1q_u8(rgba0, shuffle_tbl); + uint8x16_t bgra1 = vqtbl1q_u8(rgba1, shuffle_tbl); + uint8x16_t bgra2 = vqtbl1q_u8(rgba2, shuffle_tbl); + uint8x16_t bgra3 = vqtbl1q_u8(rgba3, shuffle_tbl); + + vst1q_u8(d, bgra0); + vst1q_u8(d + 16, bgra1); + vst1q_u8(d + 32, bgra2); + vst1q_u8(d + 48, bgra3); + + s += 64; + d += 64; + } + + /* process remaining 4-pixel chunks */ + for (; x < vec_width_4; x += 4) { + uint8x16_t rgba = vld1q_u8(s); + uint8x16_t bgra = vqtbl1q_u8(rgba, shuffle_tbl); + vst1q_u8(d, bgra); + s += 16; + d += 16; + } + + /* scalar tail for 1-3 remaining pixels */ + for (; x < width; x++) { + uint32_t pixel = ((const uint32_t *)s)[0]; + ((uint32_t *)d)[0] = (pixel & 0xff00ff00) | + ((pixel & 0xff) << 16) | + ((pixel & 0xff0000) >> 16); + s += 4; + d += 4; + } + + src += src_stride; + dst += dst_stride; + } +} +#elif DETECT_ARCH_ARM && !defined(NO_FORMAT_ASM) && !defined(__SOFTFP__) +#include + +/* armhf builds default to vfp, not neon, and refuses to compile neon intrinsics + * unless you tell it "no really". + */ +#if defined(__clang__) +__attribute__((target("neon"))) +#else +__attribute__((target("fpu=neon"))) +#endif +static void +convert_ubyte_rgba_to_bgra(size_t width, size_t height, + const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride) +{ + /* shuffle table for RGBA -> BGRA */ + static const uint8_t tbl_data[8] = { + 2, 1, 0, 3, /* pixel 0: B←R, G←G, R←B, A←A */ + 6, 5, 4, 7 + }; + uint8x8_t shuffle_tbl = vld1_u8(tbl_data); + + const size_t vec_width_8 = width & ~7u; /* 8 pixels per iteration */ + const size_t vec_width_2 = width & ~1u; /* 2 pixels per iteration */ + + for (size_t row = 0; row < height; row++) { + const uint8_t *s = src; + uint8_t *d = dst; + size_t x; + + /* process 8 pixels (32 bytes) at a time */ + for (x = 0; x < vec_width_8; x += 8) { + uint8x8_t rgba0 = vld1_u8(s); + uint8x8_t rgba1 = vld1_u8(s + 8); + uint8x8_t rgba2 = vld1_u8(s + 16); + uint8x8_t rgba3 = vld1_u8(s + 24); + + uint8x8_t bgra0 = vtbl1_u8(rgba0, shuffle_tbl); + uint8x8_t bgra1 = vtbl1_u8(rgba1, shuffle_tbl); + uint8x8_t bgra2 = vtbl1_u8(rgba2, shuffle_tbl); + uint8x8_t bgra3 = vtbl1_u8(rgba3, shuffle_tbl); + + vst1_u8(d, bgra0); + vst1_u8(d + 8, bgra1); + vst1_u8(d + 16, bgra2); + vst1_u8(d + 24, bgra3); + + s += 32; + d += 32; + } + + /* process remaining 2-pixel chunks */ + for (; x < vec_width_2; x += 2) { + uint8x8_t rgba = vld1_u8(s); + uint8x8_t bgra = vtbl1_u8(rgba, shuffle_tbl); + vst1_u8(d, bgra); + s += 8; + d += 8; + } + + /* scalar tail for remaining pixel */ + for (; x < width; x++) { + uint32_t pixel = ((const uint32_t *)s)[0]; + ((uint32_t *)d)[0] = (pixel & 0xff00ff00) | + ((pixel & 0xff) << 16) | + ((pixel & 0xff0000) >> 16); + s += 4; + d += 4; + } + + src += src_stride; + dst += dst_stride; + } +} + +#else static void convert_ubyte_rgba_to_bgra(size_t width, size_t height, const uint8_t *src, size_t src_stride, @@ -245,7 +392,7 @@ convert_ubyte_rgba_to_bgra(size_t width, size_t height, } } } - +#endif /** * This can be used to convert between most color formats.