freedreno/fdl: Optimize linear_to_tiled with avx2
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38249>
commit 0f2dcf656a
parent 3160c516ca

1 changed file with 84 additions and 4 deletions
@@ -25,6 +25,8 @@
 #if DETECT_ARCH_AARCH64
 #include <arm_neon.h>
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+#include <immintrin.h>
 #endif
 
 /* The tiling scheme on Qualcomm consists of four levels:
@@ -174,7 +176,7 @@ block_y_xormask(uint32_t y, uint32_t cpp, uint32_t bank_mask, uint32_t bank_shift)
 template<>
 uint32_t
 block_y_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t y, uint32_t cpp,
-                                         uint32_t bank_mask,
+                                         uint32_t bank_mask,
                                          uint32_t bank_shift)
 {
    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011)) << 8) |
@@ -184,7 +186,7 @@ block_y_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t y, uint32_t cpp,
 template<>
 uint32_t
 block_y_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t y, uint32_t cpp,
-                                         uint32_t bank_mask,
+                                         uint32_t bank_mask,
                                          uint32_t bank_shift)
 {
    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011) ^
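The two specializations above fold the parity bits of y into the bank-swizzle bits; the tail of each return statement falls outside the hunk. As a quick illustration of the visible sub-expression only, a minimal standalone sketch, not part of the commit:

#include <cstdio>

int main()
{
   for (unsigned y = 0; y < 4; y++) {
      // Visible part of the 4-channel specialization: the parity bits of y
      // select and combine the 0b110 and 0b011 swizzle patterns.
      unsigned xormask = (((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011)) << 8;
      // Prints 0x000, 0x600, 0x300, 0x500 for y = 0..3.
      printf("y=%u -> 0x%03x\n", y, xormask);
   }
   return 0;
}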
@@ -312,7 +314,7 @@ get_block_size(unsigned cpp, bool r8g8, uint32_t *block_width,
    }
 }
 
-void
+void
 fdl6_get_ubwc_macrotile_size(const struct fdl_layout *layout,
                              uint32_t *macrotile_width,
                              uint32_t *macrotile_height)
@@ -449,6 +451,42 @@ linear_to_tiled_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
    }
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)_linear);
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto linear_0_1_low = _mm256_unpacklo_epi16(linear0, linear1);
+      auto linear_2_3_low = _mm256_unpacklo_epi16(linear2, linear3);
+      auto linear_0_1_high = _mm256_unpackhi_epi16(linear0, linear1);
+      auto linear_2_3_high = _mm256_unpackhi_epi16(linear2, linear3);
+
+      // Effectively a 256-bit equivalent of ARM zip1
+      auto r1_zip1 = _mm256_permute4x64_epi64(linear_2_3_low, 0x60);
+      auto r1_zip2 = _mm256_permute4x64_epi64(linear_0_1_low, 0xd4);
+      auto r1 = _mm256_blend_epi32(r1_zip2, r1_zip1, 0xcc);
+
+      auto r2_zip1 = _mm256_permute4x64_epi64(linear_2_3_high, 0x60);
+      auto r2_zip2 = _mm256_permute4x64_epi64(linear_0_1_high, 0xd4);
+      auto r2 = _mm256_blend_epi32(r2_zip2, r2_zip1, 0xcc);
+
+      // Effectively a 256-bit equivalent of ARM zip2
+      auto r3_zip1 = _mm256_permute4x64_epi64(linear_2_3_low, 0xe8);
+      auto r3_zip2 = _mm256_permute4x64_epi64(linear_0_1_low, 0xf6);
+      auto r3 = _mm256_blend_epi32(r3_zip2, r3_zip1, 0xcc);
+
+      auto r4_zip1 = _mm256_permute4x64_epi64(linear_2_3_high, 0xe8);
+      auto r4_zip2 = _mm256_permute4x64_epi64(linear_0_1_high, 0xf6);
+      auto r4 = _mm256_blend_epi32(r4_zip2, r4_zip1, 0xcc);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    memcpy_small<1, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
       0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, 0, 0);
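The unpack/permute/blend sequence above emulates NEON zip1/zip2 across a full 256-bit register: _mm256_permute4x64_epi64 pre-arranges the 64-bit lanes of each source and _mm256_blend_epi32 with mask 0xcc merges them. A minimal standalone sketch (not part of the commit, assumes an -mavx2 build) that checks the r1/r3 recipe really interleaves the 64-bit lanes of two vectors:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
   // Two inputs with distinguishable 64-bit lanes: a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3}.
   __m256i a = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
   __m256i b = _mm256_set_epi64x(0xb3, 0xb2, 0xb1, 0xb0);

   // Same recipe as r1 above: a 256-bit zip1 at 64-bit granularity.
   __m256i r1 = _mm256_blend_epi32(_mm256_permute4x64_epi64(a, 0xd4),
                                   _mm256_permute4x64_epi64(b, 0x60), 0xcc);
   // Same recipe as r3 above: the matching zip2 for the upper lanes.
   __m256i r3 = _mm256_blend_epi32(_mm256_permute4x64_epi64(a, 0xf6),
                                   _mm256_permute4x64_epi64(b, 0xe8), 0xcc);

   uint64_t z1[4], z2[4];
   _mm256_storeu_si256((__m256i *)z1, r1);
   _mm256_storeu_si256((__m256i *)z2, r3);

   // Prints "r1 = a0 b0 a1 b1" and "r3 = a2 b2 a3 b3".
   printf("r1 = %llx %llx %llx %llx\n", (unsigned long long)z1[0],
          (unsigned long long)z1[1], (unsigned long long)z1[2],
          (unsigned long long)z1[3]);
   printf("r3 = %llx %llx %llx %llx\n", (unsigned long long)z2[0],
          (unsigned long long)z2[1], (unsigned long long)z2[2],
          (unsigned long long)z2[3]);
   return 0;
}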
@@ -525,6 +563,29 @@ linear_to_tiled_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
    }
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned x = 0; x < 2; x++, _linear += 32) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)_linear);
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto linear_0_1_low = _mm256_unpacklo_epi32(linear0, linear1);
+      auto linear_2_3_low = _mm256_unpacklo_epi32(linear2, linear3);
+      auto linear_0_1_high = _mm256_unpackhi_epi32(linear0, linear1);
+      auto linear_2_3_high = _mm256_unpackhi_epi32(linear2, linear3);
+
+      auto r1 = _mm256_inserti128_si256(linear_0_1_low, _mm256_castsi256_si128(linear_2_3_low), 1);
+      auto r2 = _mm256_inserti128_si256(linear_0_1_high, _mm256_castsi256_si128(linear_2_3_high), 1);
+      auto r3 = _mm256_permute2f128_si256(linear_0_1_low, linear_2_3_low, 0x31);
+      auto r4 = _mm256_permute2f128_si256(linear_0_1_high, linear_2_3_high, 0x31);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
       0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
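Here the cross-lane step works on 128-bit halves instead: _mm256_inserti128_si256(x, _mm256_castsi256_si128(y), 1) concatenates the low halves of x and y, while _mm256_permute2f128_si256(x, y, 0x31) concatenates the high halves. A minimal standalone sketch (not part of the commit) illustrating the two selectors:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
   // Label the 64-bit lanes so the 128-bit halves are easy to spot.
   __m256i x = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0); // halves: [a0 a1 | a2 a3]
   __m256i y = _mm256_set_epi64x(0xb3, 0xb2, 0xb1, 0xb0); // halves: [b0 b1 | b2 b3]

   // Low half of x followed by low half of y (as used for r1/r2 above).
   __m256i low  = _mm256_inserti128_si256(x, _mm256_castsi256_si128(y), 1);
   // High half of x followed by high half of y (as used for r3/r4 above).
   __m256i high = _mm256_permute2f128_si256(x, y, 0x31);

   uint64_t l[4], h[4];
   _mm256_storeu_si256((__m256i *)l, low);
   _mm256_storeu_si256((__m256i *)h, high);

   // Prints "low = a0 a1 b0 b1" and "high = a2 a3 b2 b3".
   printf("low  = %llx %llx %llx %llx\n", (unsigned long long)l[0],
          (unsigned long long)l[1], (unsigned long long)l[2],
          (unsigned long long)l[3]);
   printf("high = %llx %llx %llx %llx\n", (unsigned long long)h[0],
          (unsigned long long)h[1], (unsigned long long)h[2],
          (unsigned long long)h[3]);
   return 0;
}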
@@ -600,6 +661,25 @@ linear_to_tiled_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
          : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
+
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64) && defined(__AVX2__)
+   __m256i *tiled = (__m256i *)_tiled;
+   for (unsigned x = 0; x < 2; x++, _linear += 32) {
+      auto linear0 = _mm256_loadu_si256((const __m256i *)_linear);
+      auto linear1 = _mm256_loadu_si256((const __m256i *)(_linear + 1 * linear_pitch));
+      auto linear2 = _mm256_loadu_si256((const __m256i *)(_linear + 2 * linear_pitch));
+      auto linear3 = _mm256_loadu_si256((const __m256i *)(_linear + 3 * linear_pitch));
+
+      auto r1 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear0, 0xd4), _mm256_permute4x64_epi64(linear1, 0x60), 0xcc);
+      auto r2 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear2, 0xd4), _mm256_permute4x64_epi64(linear3, 0x60), 0xcc);
+      auto r3 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear0, 0xf6), _mm256_permute4x64_epi64(linear1, 0xe8), 0xcc);
+      auto r4 = _mm256_blend_epi32(_mm256_permute4x64_epi64(linear2, 0xf6), _mm256_permute4x64_epi64(linear3, 0xe8), 0xcc);
+
+      _mm256_storeu_si256(tiled, r1); tiled++;
+      _mm256_storeu_si256(tiled, r2); tiled++;
+      _mm256_storeu_si256(tiled, r3); tiled++;
+      _mm256_storeu_si256(tiled, r4); tiled++;
+   }
 #else
    pixel8_t *tiled = (pixel8_t *)_tiled;
    for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
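With 4 bytes per pixel, each 64-bit lane already holds a horizontal pixel pair, so the rows are interleaved directly at 64-bit granularity without the unpack step used for smaller pixels. A scalar model (not part of the commit) of what one r1/r3 pair in the loop above contains for two rows of eight pixels:

#include <cstdio>

int main()
{
   // Two rows of eight 4-byte pixels, labelled 0x00.. and 0x10.. respectively.
   unsigned row0[8], row1[8], tiled[16];
   for (int i = 0; i < 8; i++) {
      row0[i] = 0x00 + i;
      row1[i] = 0x10 + i;
   }

   // Interleave the rows in 2-pixel (64-bit) groups, i.e. the contents of
   // r1 followed by r3 for these two rows: 2x2 pixel quads in tiled order.
   unsigned *out = tiled;
   for (int pair = 0; pair < 4; pair++) {
      *out++ = row0[2 * pair];
      *out++ = row0[2 * pair + 1];
      *out++ = row1[2 * pair];
      *out++ = row1[2 * pair + 1];
   }

   // Prints one 2x2 quad per line: "00 01 10 11", "02 03 12 13", ...
   for (int i = 0; i < 16; i++)
      printf("%02x%c", tiled[i], (i % 4 == 3) ? '\n' : ' ');
   return 0;
}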
@@ -908,7 +988,7 @@ fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
       uint32_t x_block = (x + x_start) / block_width;
       uint32_t x_pixel = (x + x_start) % block_width;
 
-      uint32_t block_offset =
+      uint32_t block_offset =
          get_block_offset(x_block, y_block, macrotile_stride, bank_mask,
                           bank_shift);
       uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);
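The copy loop above splits each destination coordinate into a block index and an intra-block pixel position before combining get_block_offset() and get_pixel_offset(). A minimal sketch of that decomposition, using made-up block_width/block_height values purely for illustration (the real values come from get_block_size()):

#include <cstdint>
#include <cstdio>

int main()
{
   // Hypothetical block dimensions, for illustration only.
   const uint32_t block_width = 16, block_height = 4;
   const uint32_t x_start = 5, y_start = 2;

   for (uint32_t y = 0; y < 2; y++) {
      for (uint32_t x = 0; x < 2; x++) {
         // Same decomposition as in the hunk above.
         uint32_t x_block = (x + x_start) / block_width;
         uint32_t x_pixel = (x + x_start) % block_width;
         uint32_t y_block = (y + y_start) / block_height;
         uint32_t y_pixel = (y + y_start) % block_height;
         printf("x=%u y=%u -> block (%u,%u), pixel (%u,%u)\n",
                x, y, x_block, y_block, x_pixel, y_pixel);
      }
   }
   return 0;
}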