From 0f2dcf656a851b2ac13e34ee22c97d60ec72b625 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Tue, 28 Oct 2025 10:13:34 -0700
Subject: [PATCH] freedreno/fdl: Optimize linear_to_tiled with avx2

Part-of:
---
 src/freedreno/fdl/fd6_tiled_memcpy.cc | 88 +++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 4 deletions(-)

diff --git a/src/freedreno/fdl/fd6_tiled_memcpy.cc b/src/freedreno/fdl/fd6_tiled_memcpy.cc
index 876d94609dc..7bde850b404 100644
--- a/src/freedreno/fdl/fd6_tiled_memcpy.cc
+++ b/src/freedreno/fdl/fd6_tiled_memcpy.cc
@@ -25,6 +25,8 @@
 
 #if DETECT_ARCH_AARCH64
 #include <arm_neon.h>
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+#include <immintrin.h>
 #endif
 
 /* The tiling scheme on Qualcomm consists of four levels:
@@ -174,7 +176,7 @@ block_y_xormask(uint32_t y, uint32_t cpp, uint32_t bank_mask, uint32_t bank_shif
 template<>
 uint32_t
 block_y_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t y, uint32_t cpp,
-                                         uint32_t bank_mask, 
+                                         uint32_t bank_mask,
                                          uint32_t bank_shift)
 {
    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011)) << 8) |
@@ -184,7 +186,7 @@
 template<>
 uint32_t
 block_y_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t y, uint32_t cpp,
-                                         uint32_t bank_mask, 
+                                         uint32_t bank_mask,
                                          uint32_t bank_shift)
 {
    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011) ^
@@ -312,7 +314,7 @@ get_block_size(unsigned cpp, bool r8g8, uint32_t *block_width,
    }
 }
 
-void 
+void
 fdl6_get_ubwc_macrotile_size(const struct fdl_layout *layout,
                              uint32_t *macrotile_width,
                              uint32_t *macrotile_height)
@@ -449,6 +451,42 @@ linear_to_tiled_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15");
    }
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)(_linear));
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto linear_0_1_low = _mm256_unpacklo_epi16(linear0, linear1);
+      auto linear_2_3_low = _mm256_unpacklo_epi16(linear2, linear3);
+      auto linear_0_1_high = _mm256_unpackhi_epi16(linear0, linear1);
+      auto linear_2_3_high = _mm256_unpackhi_epi16(linear2, linear3);
+
+      // Effectively a 256-bit equivalent of AArch64 zip1
+      auto r1_zip1 = _mm256_permute4x64_epi64(linear_2_3_low, 0x60);
+      auto r1_zip2 = _mm256_permute4x64_epi64(linear_0_1_low, 0xd4);
+      auto r1 = _mm256_blend_epi32(r1_zip2, r1_zip1, 0xcc);
+
+      auto r2_zip1 = _mm256_permute4x64_epi64(linear_2_3_high, 0x60);
+      auto r2_zip2 = _mm256_permute4x64_epi64(linear_0_1_high, 0xd4);
+      auto r2 = _mm256_blend_epi32(r2_zip2, r2_zip1, 0xcc);
+
+      // Effectively a 256-bit equivalent of AArch64 zip2
+      auto r3_zip1 = _mm256_permute4x64_epi64(linear_2_3_low, 0xe8);
+      auto r3_zip2 = _mm256_permute4x64_epi64(linear_0_1_low, 0xf6);
+      auto r3 = _mm256_blend_epi32(r3_zip2, r3_zip1, 0xcc);
+
+      auto r4_zip1 = _mm256_permute4x64_epi64(linear_2_3_high, 0xe8);
+      auto r4_zip2 = _mm256_permute4x64_epi64(linear_0_1_high, 0xf6);
+      auto r4 = _mm256_blend_epi32(r4_zip2, r4_zip1, 0xcc);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    memcpy_small<1, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
       0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, 0, 0);
@@ -525,6 +563,29 @@ linear_to_tiled_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15");
    }
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned x = 0; x < 2; x++, _linear += 32) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)(_linear));
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto linear_0_1_low = _mm256_unpacklo_epi32(linear0, linear1);
+      auto linear_2_3_low = _mm256_unpacklo_epi32(linear2, linear3);
+      auto linear_0_1_high = _mm256_unpackhi_epi32(linear0, linear1);
+      auto linear_2_3_high = _mm256_unpackhi_epi32(linear2, linear3);
+
+      auto r1 = _mm256_inserti128_si256(linear_0_1_low, _mm256_castsi256_si128(linear_2_3_low), 1);
+      auto r2 = _mm256_inserti128_si256(linear_0_1_high, _mm256_castsi256_si128(linear_2_3_high), 1);
+      auto r3 = _mm256_permute2f128_si256(linear_0_1_low, linear_2_3_low, 0x31);
+      auto r4 = _mm256_permute2f128_si256(linear_0_1_high, linear_2_3_high, 0x31);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
       0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
@@ -600,6 +661,25 @@ linear_to_tiled_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
      : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v12", "v13", "v14", "v15");
+
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned x = 0; x < 2; x++, _linear += 32) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)(_linear));
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + 1 * linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto r1 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear0, 0xd4), _mm256_permute4x64_epi64(linear1, 0x60), 0xcc);
+      auto r2 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear2, 0xd4), _mm256_permute4x64_epi64(linear3, 0x60), 0xcc);
+      auto r3 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear0, 0xf6), _mm256_permute4x64_epi64(linear1, 0xe8), 0xcc);
+      auto r4 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear2, 0xf6), _mm256_permute4x64_epi64(linear3, 0xe8), 0xcc);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    pixel8_t *tiled = (pixel8_t *)_tiled;
    for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
@@ -908,7 +988,7 @@ fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
          uint32_t x_block = (x + x_start) / block_width;
          uint32_t x_pixel = (x + x_start) % block_width;
 
-         uint32_t block_offset = 
+         uint32_t block_offset =
             get_block_offset(x_block, y_block, macrotile_stride, bank_mask,
                              bank_shift);
          uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);
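
Two notes on the shuffle sequences follow. Both are standalone sketches rather than part of the patch; the test harnesses, file organization, and helper names are illustrative only.

The 1cpp and 4cpp paths emulate AArch64 zip1/zip2 on 64-bit lanes. AVX2 has no single cross-lane interleave, so each input is pre-permuted with _mm256_permute4x64_epi64 and the two results are merged with a 32-bit blend whose mask 0xcc selects 64-bit lanes 1 and 3 from the second operand. A minimal check of that idiom against a scalar reference (compile with g++ -mavx2 or clang++ -mavx2):

// Standalone sketch (not part of the patch): verifies that the
// permute4x64+blend pairs used above behave like AArch64 zip1/zip2
// on 64-bit lanes.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

static void zip64_ref(const uint64_t a[4], const uint64_t b[4],
                      uint64_t lo[4], uint64_t hi[4])
{
   /* Scalar reference: interleave low halves (zip1) and high halves (zip2). */
   lo[0] = a[0]; lo[1] = b[0]; lo[2] = a[1]; lo[3] = b[1];
   hi[0] = a[2]; hi[1] = b[2]; hi[2] = a[3]; hi[3] = b[3];
}

int main()
{
   const uint64_t a[4] = {0, 1, 2, 3}, b[4] = {4, 5, 6, 7};
   __m256i va = _mm256_loadu_si256((const __m256i *)a);
   __m256i vb = _mm256_loadu_si256((const __m256i *)b);

   /* Same idiom as the patch: 0xd4 gives [a0 a1 a1 a3], 0x60 gives
    * [b0 b0 b2 b1]; the blend then takes 64-bit lanes 1 and 3 from
    * the second operand, yielding [a0 b0 a1 b1] (zip1). */
   __m256i lo = _mm256_blend_epi32(_mm256_permute4x64_epi64(va, 0xd4),
                                   _mm256_permute4x64_epi64(vb, 0x60), 0xcc);
   /* 0xf6 gives [a2 a1 a3 a3], 0xe8 gives [b0 b2 b2 b3]; the blend
    * yields [a2 b2 a3 b3] (zip2). */
   __m256i hi = _mm256_blend_epi32(_mm256_permute4x64_epi64(va, 0xf6),
                                   _mm256_permute4x64_epi64(vb, 0xe8), 0xcc);

   uint64_t got_lo[4], got_hi[4], ref_lo[4], ref_hi[4];
   _mm256_storeu_si256((__m256i *)got_lo, lo);
   _mm256_storeu_si256((__m256i *)got_hi, hi);
   zip64_ref(a, b, ref_lo, ref_hi);

   bool ok = !memcmp(got_lo, ref_lo, sizeof ref_lo) &&
             !memcmp(got_hi, ref_hi, sizeof ref_hi);
   printf("zip emulation %s\n", ok ? "matches" : "MISMATCH");
   return ok ? 0 : 1;
}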
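The 2cpp path instead interleaves four rows with epi32 unpacks, which also operate within each 128-bit lane; _mm256_inserti128_si256 and _mm256_permute2f128_si256 then collect the low and high lanes into contiguous outputs. A sketch (again standalone and illustrative, not part of the patch) that prints the resulting interleave, with element values encoded as row*10 + column:

// Standalone sketch: shows why the 2cpp path needs a cross-lane merge
// after the epi32 unpacks, which only interleave within 128-bit lanes.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static void print_row(const char *name, __m256i v)
{
   uint32_t e[8];
   _mm256_storeu_si256((__m256i *)e, v);
   printf("%s:", name);
   for (int i = 0; i < 8; i++)
      printf(" %2u", e[i]);
   printf("\n");
}

int main()
{
   /* Four "rows" of eight 32-bit elements (row*10 + column). */
   uint32_t rows[4][8];
   for (uint32_t r = 0; r < 4; r++)
      for (uint32_t c = 0; c < 8; c++)
         rows[r][c] = r * 10 + c;

   __m256i l0 = _mm256_loadu_si256((const __m256i *)rows[0]);
   __m256i l1 = _mm256_loadu_si256((const __m256i *)rows[1]);
   __m256i l2 = _mm256_loadu_si256((const __m256i *)rows[2]);
   __m256i l3 = _mm256_loadu_si256((const __m256i *)rows[3]);

   __m256i lo01 = _mm256_unpacklo_epi32(l0, l1);
   __m256i lo23 = _mm256_unpacklo_epi32(l2, l3);
   __m256i hi01 = _mm256_unpackhi_epi32(l0, l1);
   __m256i hi23 = _mm256_unpackhi_epi32(l2, l3);

   /* Same merges as the patch: r1/r2 gather the low 128-bit lanes of
    * each unpack pair, r3/r4 gather the high 128-bit lanes. */
   __m256i r1 = _mm256_inserti128_si256(lo01, _mm256_castsi256_si128(lo23), 1);
   __m256i r2 = _mm256_inserti128_si256(hi01, _mm256_castsi256_si128(hi23), 1);
   __m256i r3 = _mm256_permute2f128_si256(lo01, lo23, 0x31);
   __m256i r4 = _mm256_permute2f128_si256(hi01, hi23, 0x31);

   print_row("r1", r1);  /*  0 10  1 11 20 30 21 31 */
   print_row("r2", r2);  /*  2 12  3 13 22 32 23 33 */
   print_row("r3", r3);  /*  4 14  5 15 24 34 25 35 */
   print_row("r4", r4);  /*  6 16  7 17 26 36 27 37 */
   return 0;
}

Using vinserti128 for the low halves rather than a second vperm2f128 is plausibly deliberate: vinserti128 is cheaper than vperm2f128 on some microarchitectures (notably older AMD Zen cores), and the two forms are interchangeable here.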