From d778ede72c4c31bf4ed80578688c3e78d9af2dfa Mon Sep 17 00:00:00 2001 From: Ganesh Belgur Ramachandra Date: Mon, 23 Mar 2026 22:14:28 +0000 Subject: [PATCH] amd: import gfx11.7 addrlib Part-of: --- src/amd/addrlib/inc/addrinterface.h | 132 +- src/amd/addrlib/inc/addrtypes.h | 2 +- src/amd/addrlib/meson.build | 1 + src/amd/addrlib/src/addrinterface.cpp | 30 +- src/amd/addrlib/src/amdgpu_asic_addr.h | 5 +- src/amd/addrlib/src/core/addrcommon.h | 39 +- src/amd/addrlib/src/core/addrlib.cpp | 84 +- src/amd/addrlib/src/core/addrlib.h | 10 +- src/amd/addrlib/src/core/addrlib1.cpp | 10 +- src/amd/addrlib/src/core/addrlib2.cpp | 20 +- src/amd/addrlib/src/core/addrlib3.cpp | 43 +- src/amd/addrlib/src/core/addrlib3.h | 13 +- src/amd/addrlib/src/core/addrswizzler.cpp | 847 +++++++- src/amd/addrlib/src/core/addrswizzler.h | 33 +- src/amd/addrlib/src/core/addrswizzlersimd.h | 2031 +++++++++++++++++++ src/amd/addrlib/src/gfx10/gfx10addrlib.cpp | 110 +- src/amd/addrlib/src/gfx11/gfx11addrlib.cpp | 280 ++- src/amd/addrlib/src/gfx11/gfx11addrlib.h | 5 +- src/amd/addrlib/src/gfx12/gfx12addrlib.cpp | 213 +- src/amd/addrlib/src/gfx12/gfx12addrlib.h | 8 +- 20 files changed, 3407 insertions(+), 509 deletions(-) create mode 100644 src/amd/addrlib/src/core/addrswizzlersimd.h diff --git a/src/amd/addrlib/inc/addrinterface.h b/src/amd/addrlib/inc/addrinterface.h index 4ec2026376d..c1d9c1f7e5b 100644 --- a/src/amd/addrlib/inc/addrinterface.h +++ b/src/amd/addrlib/inc/addrinterface.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -24,7 +24,7 @@ extern "C" #endif #define ADDRLIB_VERSION_MAJOR 10 -#define ADDRLIB_VERSION_MINOR 1 +#define ADDRLIB_VERSION_MINOR 6 #define ADDRLIB_MAKE_VERSION(major, minor) ((major << 16) | minor) #define ADDRLIB_VERSION ADDRLIB_MAKE_VERSION(ADDRLIB_VERSION_MAJOR, ADDRLIB_VERSION_MINOR) @@ -107,6 +107,11 @@ typedef struct _ADDR_EXTENT3D * AddrComputeFmaskAddrFromCoord() * AddrComputeFmaskCoordFromAddr() * +* ///////////////////////////////////////////////////////////////////////////////////////////////// +* // Format properties functions +* ///////////////////////////////////////////////////////////////////////////////////////////////// +* AddrFormatProperties() +* **/ /** * ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -452,6 +457,49 @@ ADDR_E_RETURNCODE ADDR_API AddrCreate( ADDR_E_RETURNCODE ADDR_API AddrDestroy( ADDR_HANDLE hLib); +/** +**************************************************************************************************** +* ADDR_FORMAT_PROPERTIES_IN +* +* @brief +* Input structure to the AddrFormatProperties routine. 
+* +**************************************************************************************************** +*/ +typedef struct _ADDR_FORMAT_PROPERTIES_IN { + UINT_32 size; ///< Size of this structure in bytes + AddrFormat format; ///< If format is set to valid one, bpp/width/height + /// might be overwritten +} ADDR_FORMAT_PROPERTIES_IN; + +/** +**************************************************************************************************** +* ADDR_FORMAT_PROPERTIES_OUT +* +* @brief +* Output structure from the AddrFormatProperties routine. +* +**************************************************************************************************** +*/ +typedef struct _ADDR_FORMAT_PROPERTIES_OUT { + UINT_32 size; ///< Size of this structure in bytes + UINT_32 bpp; ///< Bits per pixel as laid out in memory (eg. 128bpp for BC7) + ADDR_EXTENT2D expand; ///< Dimensions of one macro pixel block +} ADDR_FORMAT_PROPERTIES_OUT; + +/** +**************************************************************************************************** +* AddrFormatProperties +* +* @brief +* Gets a list of format properties +* +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API AddrFormatProperties( + ADDR_HANDLE hLib, + const ADDR_FORMAT_PROPERTIES_IN* in, + ADDR_FORMAT_PROPERTIES_OUT* pOut); //////////////////////////////////////////////////////////////////////////////////////////////////// // Surface functions @@ -2463,6 +2511,7 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 rotated : 1; ///< This resource is rotated and displayable UINT_32 needEquation : 1; ///< This resource needs equation to be generated if possible UINT_32 opt4space : 1; ///< This resource should be optimized for space + UINT_32 computeMaxSize : 1; ///< This resource should select the largest swizzle possible UINT_32 minimizeAlign : 1; ///< This resource should use minimum alignment UINT_32 noMetadata : 1; ///< This resource has no metadata UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata @@ -2470,7 +2519,7 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs) UINT_32 requireMetadata : 1; ///< This resource must support metadata - UINT_32 reserved : 11; ///< Reserved bits + UINT_32 reserved : 10; ///< Reserved bits }; UINT_32 value; @@ -2666,6 +2715,31 @@ ADDR_E_RETURNCODE ADDR_API Addr2ComputeSurfaceAddrFromCoord( const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut); +/** +**************************************************************************************************** +* ADDR_COPY_FLAGS +* +* @brief +* Options controlling image copy functions. +**************************************************************************************************** +*/ +typedef union _ADDR_COPY_FLAGS { + struct + { + UINT_32 blockMemcpy : 1; ///< Memory layout is pre-swizzled and stored block-by-block. + /// For regions in the miptail, this uses hybrid memcpy. + /// Regions must cover full width/height of the subresource. + UINT_32 hybridMemcpy : 1; ///< Memory layout is partially pre-swizzled and stored + /// microblock-by-microblock. Data in this format is agnostic to + /// chip harvesting and block size. Regions will be padded out + /// to microblock boundaries for alignment. + /// Mutually exclusive with 'blockMemcpy'. 
+ UINT_32 reserved : 30; ///< Reserved bits + }; + + UINT_32 value; +} ADDR_COPY_FLAGS; + /** **************************************************************************************************** * ADDR2_COPY_MEMSURFACE_REGION @@ -2718,6 +2792,7 @@ typedef struct _ADDR2_COPY_MEMSURFACE_INPUT /// - copyDims.depth == 1 /// - all copy regions target the same mip /// - all copy regions target the same slice/depth + ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed. } ADDR2_COPY_MEMSURFACE_INPUT; /** @@ -4008,30 +4083,34 @@ typedef union _ADDR2_SWMODE_SET */ typedef struct _ADDR2_GET_PREFERRED_SURF_SETTING_INPUT { - UINT_32 size; ///< Size of this structure in bytes + UINT_32 size; ///< Size of this structure in bytes - ADDR2_SURFACE_FLAGS flags; ///< Surface flags - AddrResourceType resourceType; ///< Surface type - AddrFormat format; ///< Surface format - AddrResrouceLocation resourceLoction; ///< Surface heap choice - ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting - ///< such as linear for DXTn, tiled for YUV - ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted - BOOL_32 noXor; ///< Do not use xor mode for this resource - UINT_32 bpp; ///< bits per pixel - UINT_32 width; ///< Width (of mip0), in pixels - UINT_32 height; ///< Height (of mip0), in pixels - UINT_32 numSlices; ///< Number surface slice/depth (of mip0), - UINT_32 numMipLevels; ///< Total mipmap levels. - UINT_32 numSamples; ///< Number of samples - UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as - /// number of samples for normal AA; Set it to the - /// number of fragments for EQAA - UINT_32 maxAlign; ///< maximum base/size alignment requested by client - UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will - /// be padded to multiple of this value (in bytes) - DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible - /// size. + ADDR2_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + AddrResrouceLocation resourceLoction; ///< Surface heap choice + ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting + ///< such as linear for DXTn, tiled for YUV + ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted + BOOL_32 noXor; ///< Do not use xor mode for this resource + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as + /// number of samples for normal AA; Set it to the + /// number of fragments for EQAA + UINT_32 maxAlign; ///< maximum base/size alignment requested by client + UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will + /// be padded to multiple of this value (in bytes) + DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible + /// size. + bool useBlockBasedHeuristic; ///< Use the block-based heuristic for swizzle mode selection. + /// The heuristic has the property of image size predictably + /// with image extents, which is needed for Vulkan. 
It ignores + /// minSizeAlign, maxAlign and memoryBudget options } ADDR2_GET_PREFERRED_SURF_SETTING_INPUT; /** @@ -4488,6 +4567,7 @@ typedef struct _ADDR3_COPY_MEMSURFACE_INPUT /// - copyDims.depth == 1 /// - all copy regions target the same mip /// - all copy regions target the same slice/depth + ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed. } ADDR3_COPY_MEMSURFACE_INPUT; /** diff --git a/src/amd/addrlib/inc/addrtypes.h b/src/amd/addrlib/inc/addrtypes.h index 66f7ed4a4af..0934cf023a6 100644 --- a/src/amd/addrlib/inc/addrtypes.h +++ b/src/amd/addrlib/inc/addrtypes.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/amd/addrlib/meson.build b/src/amd/addrlib/meson.build index 7a5266f8f38..6341649152b 100644 --- a/src/amd/addrlib/meson.build +++ b/src/amd/addrlib/meson.build @@ -20,6 +20,7 @@ files_addrlib = files( 'src/core/addrobject.h', 'src/core/addrswizzler.cpp', 'src/core/addrswizzler.h', + 'src/core/addrswizzlersimd.h', 'src/core/coord.cpp', 'src/core/coord.h', 'src/gfx9/gfx9addrlib.cpp', diff --git a/src/amd/addrlib/src/addrinterface.cpp b/src/amd/addrlib/src/addrinterface.cpp index 52a1f91ef6d..2f2073dc0b6 100644 --- a/src/amd/addrlib/src/addrinterface.cpp +++ b/src/amd/addrlib/src/addrinterface.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -81,6 +81,34 @@ ADDR_E_RETURNCODE ADDR_API AddrDestroy( return returnCode; } +/** +**************************************************************************************************** +* AddrFormatProperties +* +* @brief +* Retreives properties of the specified format. +* +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API AddrFormatProperties( + ADDR_HANDLE hLib, + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut) +{ + ADDR_E_RETURNCODE retCode = ADDR_INVALIDPARAMS; + + if (hLib) + { + Lib* pLib = Lib::GetLib(hLib); + + if (pLib != NULL) + { + retCode = pLib->GetFormatProperties(in, pOut); + } + } + + return retCode; +} //////////////////////////////////////////////////////////////////////////////////////////////////// // Surface functions diff --git a/src/amd/addrlib/src/amdgpu_asic_addr.h b/src/amd/addrlib/src/amdgpu_asic_addr.h index 58e4be056d8..2f56743b040 100644 --- a/src/amd/addrlib/src/amdgpu_asic_addr.h +++ b/src/amd/addrlib/src/amdgpu_asic_addr.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,6 +29,7 @@ #define FAMILY_NV3 0x91 //# 145 / Navi: 3x #define FAMILY_STX 0x96 #define FAMILY_PHX 0x94 //# 148 / Phoenix +#define FAMILY_GFX1170 0x9A #define FAMILY_RMB 0x92 //# 146 / Rembrandt #define FAMILY_RPL 0x95 //# 149 / Raphael #define FAMILY_MDN 0x97 //# 151 / Mendocino @@ -109,6 +110,7 @@ #define AMDGPU_PHOENIX2_RANGE 0x80, 0xC0 //# 128 <= x < 192 #define AMDGPU_HAWK_POINT1_RANGE 0xC0, 0xF0 //# 192 <= x < 240 #define AMDGPU_HAWK_POINT2_RANGE 0xF0, 0xFF //# 240 <= x < 255 +#define AMDGPU_GFX1170_RANGE 0x01, 0x40 //# 1 <= x < 64 #define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255 #define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max @@ -189,6 +191,7 @@ #define ASICREV_IS_PHOENIX2(r) ASICREV_IS(r, PHOENIX2) #define ASICREV_IS_HAWK_POINT1(r) ASICREV_IS(r, HAWK_POINT1) #define ASICREV_IS_HAWK_POINT2(r) ASICREV_IS(r, HAWK_POINT2) +#define ASICREV_IS_GFX1170(r) ASICREV_IS(r, GFX1170) #define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT) #define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL) diff --git a/src/amd/addrlib/src/core/addrcommon.h b/src/amd/addrlib/src/core/addrcommon.h index f07de1aef93..9c1766ae603 100644 --- a/src/amd/addrlib/src/core/addrcommon.h +++ b/src/amd/addrlib/src/core/addrcommon.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -360,7 +360,7 @@ static inline UINT_32 BitMaskScanForward( { ADDR_ASSERT(mask > 0); unsigned long out = 0; -#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64)) +#if ((defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))) && !defined(_M_ARM64EC) out = ::_tzcnt_u32(mask); #elif (defined(_WIN32) || defined(_WIN64)) ::_BitScanForward(&out, mask); @@ -436,6 +436,22 @@ static inline UINT_64 IsPow2( return !(dim & (dim - 1)); } +/** +**************************************************************************************************** +* RoundUpToMultiple +* +* @brief +* Rounds up the specified integer to the nearest multiple of the specified alignment value. +**************************************************************************************************** +*/ +template +constexpr T RoundUpToMultiple( + T operand, ///< Value to be aligned. + T alignment) ///< Alignment desired. +{ + return (((operand + (alignment - 1)) / alignment) * alignment); +} + /** **************************************************************************************************** * PowTwoAlign @@ -647,6 +663,25 @@ static inline UINT_32 Log2( return (x != 0) ? (31 ^ BitMaskScanReverse(x)) : 0; } +/** +**************************************************************************************************** +* ConstexprLog2 +* +* @brief +* Compute log of base 2 no matter the target is power of 2 or not. Returns 0 if 0. 
+**************************************************************************************************** +*/ +static constexpr inline UINT_32 ConstexprLog2( + UINT_32 x) ///< [in] the value should calculate log based 2 +{ + UINT_32 out = 0; + while (x >>= 1) + { + out++; + } + return out; +} + /** **************************************************************************************************** * QLog2 diff --git a/src/amd/addrlib/src/core/addrlib.cpp b/src/amd/addrlib/src/core/addrlib.cpp index 457ee43129a..916b77b5309 100644 --- a/src/amd/addrlib/src/core/addrlib.cpp +++ b/src/amd/addrlib/src/core/addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -222,6 +222,7 @@ ADDR_E_RETURNCODE Lib::Create( case FAMILY_NV3: case FAMILY_STX: case FAMILY_PHX: + case FAMILY_GFX1170: pLib = Gfx11HwlInit(&client); break; case FAMILY_NV4: @@ -304,6 +305,44 @@ ADDR_E_RETURNCODE Lib::Create( return returnCode; } +/** +**************************************************************************************************** +* Lib::GetFormatProperties +* +* @brief +* Returns the properties of the format as specifed in the input. +* @return +* ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE Lib::GetFormatProperties( + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((in.size != sizeof(ADDR_FORMAT_PROPERTIES_IN)) || + (pOut->size != sizeof(ADDR_FORMAT_PROPERTIES_OUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + if (returnCode == ADDR_OK) + { + pOut->bpp = GetElemLib()->GetBitsPerPixel(in.format, + nullptr, // elemMode, unused + &pOut->expand.width, + &pOut->expand.height, + nullptr); // unused bits + } + + return returnCode; +} + /** **************************************************************************************************** * Lib::SetChipFamily @@ -315,7 +354,7 @@ ADDR_E_RETURNCODE Lib::Create( **************************************************************************************************** */ VOID Lib::SetChipFamily( - UINT_32 uChipFamily, ///< [in] chip family defined in atiih.h + UINT_32 uChipFamily, ///< [in] chip family defined in atiid.h UINT_32 uChipRevision) ///< [in] chip revision defined in "asic_family"_id.h { ChipFamily family = HwlConvertChipFamily(uChipFamily, uChipRevision); @@ -668,6 +707,47 @@ UINT_32 Lib::GetBpe(AddrFormat format) const return GetElemLib()->GetBitsPerPixel(format); } +/** +**************************************************************************************************** +* Lib::GetSwizzleModePreferenceRatio +* +* @brief +* Get ratio driving swizzle mode selection heuristic. 
Ratio is returned as fraction nominator +* and denominator +* @return +* void +**************************************************************************************************** +*/ +void Lib::GetSwizzleModePreferenceRatio( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + UINT_32* pOutRatioLo, + UINT_32* pOutRatioHi + ) const +{ + const BOOL_32 computeMinSize = (pIn->flags.minimizeAlign == 1) || (pIn->memoryBudget >= 1.0); + + if (computeMinSize) + { + *pOutRatioLo = 1; + *pOutRatioHi = 1; + } + else if (pIn->flags.opt4space) + { + *pOutRatioLo = 3; + *pOutRatioHi = 2; + } + else if (pIn->flags.computeMaxSize) + { + *pOutRatioLo = 1024; + *pOutRatioHi = 1; + } + else + { + *pOutRatioLo = 2; + *pOutRatioHi = 1; + } +} + /** ************************************************************************************************************************ * Lib::ComputeOffsetFromSwizzlePattern diff --git a/src/amd/addrlib/src/core/addrlib.h b/src/amd/addrlib/src/core/addrlib.h index 64881a7f434..7f2a4fe0ad6 100644 --- a/src/amd/addrlib/src/core/addrlib.h +++ b/src/amd/addrlib/src/core/addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -300,6 +300,10 @@ public: delete this; } + ADDR_E_RETURNCODE GetFormatProperties( + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut) const; + static Lib* GetLib(ADDR_HANDLE hLib); /// Returns which version of addrlib functions should be used. @@ -333,6 +337,10 @@ public: UINT_32 GetBpe(AddrFormat format) const; + void GetSwizzleModePreferenceRatio( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + UINT_32* pOutRatioLo, + UINT_32* pOutRatioHi) const; static UINT_32 ComputeOffsetFromSwizzlePattern( const UINT_64* pPattern, diff --git a/src/amd/addrlib/src/core/addrlib1.cpp b/src/amd/addrlib/src/core/addrlib1.cpp index fc459f5684e..2742ce18467 100644 --- a/src/amd/addrlib/src/core/addrlib1.cpp +++ b/src/amd/addrlib/src/core/addrlib1.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -2473,7 +2473,7 @@ UINT_64 Lib::HwlComputeXmaskAddrFromCoord( // macroTileIndexX = x / macroTileWidth; macroTileIndexY = y / macroTileHeight; - macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes; + macroTileOffset = (static_cast(macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes; // // Compute the pixel offset within the macro tile. @@ -2675,7 +2675,7 @@ VOID Lib::ComputeSurfaceCoordFromAddrMicroTiled( // sliceBits = static_cast(pitch) * height * microTileThickness * bpp * numSamples; - rowBits = (pitch / MicroTileWidth) * microTileBits; + rowBits = static_cast(pitch / MicroTileWidth) * microTileBits; // // Extract the slice index. 
@@ -3559,11 +3559,11 @@ BOOL_32 Lib::DegradeTo1D( if (degrade == FALSE) { // Only check width and height as slices are aligned to thickness - UINT_64 unalignedSize = width * height; + UINT_64 unalignedSize = static_cast(width) * height; UINT_32 alignedPitch = PowTwoAlign(width, macroTilePitchAlign); UINT_32 alignedHeight = PowTwoAlign(height, macroTileHeightAlign); - UINT_64 alignedSize = alignedPitch * alignedHeight; + UINT_64 alignedSize = static_cast(alignedPitch) * alignedHeight; // alignedSize > 1.5 * unalignedSize if (2 * alignedSize > 3 * unalignedSize) diff --git a/src/amd/addrlib/src/core/addrlib2.cpp b/src/amd/addrlib/src/core/addrlib2.cpp index b3920f34959..a46aa4617a5 100644 --- a/src/amd/addrlib/src/core/addrlib2.cpp +++ b/src/amd/addrlib/src/core/addrlib2.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -207,7 +207,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( // Overwrite these parameters if we have a valid format } - if (localIn.bpp != 0) + if (localIn.bpp >= 8) { localIn.width = Max(localIn.width, 1u); localIn.height = Max(localIn.height, 1u); @@ -444,8 +444,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface( void* pMipBase = VoidPtrInc(pIn->pMappedSurface, (pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset)); - const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width; - const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch; + const size_t lineSizeBytes = (static_cast(localIn.bpp) >> 3) * pCurRegion->copyDims.width; + const size_t lineImgPitchBytes = (static_cast(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch; for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++) { @@ -504,6 +504,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -573,6 +578,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -1424,7 +1434,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( { pOut->addr = (localOut.sliceSize * pIn->slice) + mipInfo[pIn->mipId].offset + - (pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); + (static_cast(pIn->y) * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); pOut->bitPosition = 0; } else diff --git a/src/amd/addrlib/src/core/addrlib3.cpp b/src/amd/addrlib/src/core/addrlib3.cpp index b809f908375..ac9d1022bad 100644 --- a/src/amd/addrlib/src/core/addrlib3.cpp +++ b/src/amd/addrlib/src/core/addrlib3.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. 
All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,8 +39,6 @@ namespace V3 Lib::Lib() : Addr::Lib(), - m_pipesLog2(0), - m_pipeInterleaveLog2(0), m_numEquations(0) { Init(); @@ -59,8 +57,6 @@ Lib::Lib( const Client* pClient) : Addr::Lib(pClient), - m_pipesLog2(0), - m_pipeInterleaveLog2(0), m_numEquations(0) { Init(); @@ -265,7 +261,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( // Overwrite these parameters if we have a valid format } - if (localIn.bpp != 0) + if (localIn.bpp >= 8) { localIn.width = Max(localIn.width, 1u); localIn.height = Max(localIn.height, 1u); @@ -547,8 +543,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface( void* pMipBase = VoidPtrInc(pIn->pMappedSurface, (pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset)); - const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width; - const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch; + const size_t lineSizeBytes = (static_cast(localIn.bpp) >> 3) * pCurRegion->copyDims.width; + const size_t lineImgPitchBytes = (static_cast(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch; for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++) { @@ -611,6 +607,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -680,6 +681,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -736,7 +742,7 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor( const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) { - ADDR_E_RETURNCODE returnCode; + ADDR_E_RETURNCODE returnCode = ADDR_OK; if ((GetFillSizeFieldsFlags() == TRUE) && ((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) || @@ -746,7 +752,23 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor( } else { - returnCode = HwlComputePipeBankXor(pIn, pOut); + // The swizzle mode determines how many unused bits there are in the address. We never (ok, rarely...) program + // the low eight bits of the address, so the "numSwizzleBits" effectively represents the number of "guaranteed + // zero" programmed bits in the address. + const UINT_32 numSwizzleBits = GetBlockSizeLog2(pIn->swizzleMode, FALSE) - 8; + + // make sure this configuration supports swizzling + if (numSwizzleBits != 0) + { + // These cases should have been excluded with the "numSwizzleBits" calculation above, but make sure here. + ADDR_ASSERT((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE)); + + pOut->pipeBankXor = pIn->surfIndex % (1 << numSwizzleBits); + } + else + { + pOut->pipeBankXor = 0; + } } return returnCode; @@ -1167,7 +1189,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfoSanityCheck( return HwlValidateNonSwModeParams(&localIn) ? 
ADDR_OK : ADDR_INVALIDPARAMS; } - /** ************************************************************************************************************************ * Lib::ComputeOffsetFromEquation diff --git a/src/amd/addrlib/src/core/addrlib3.h b/src/amd/addrlib/src/core/addrlib3.h index 6f6ecf6d99f..ae4e6b7fd8f 100644 --- a/src/amd/addrlib/src/core/addrlib3.h +++ b/src/amd/addrlib/src/core/addrlib3.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -44,7 +44,6 @@ struct ADDR3_COORD struct ADDR3_COMPUTE_SURFACE_INFO_PARAMS_INPUT { const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pSurfInfo; - void* pvAddrParams; }; /** @@ -155,14 +154,16 @@ protected: Lib(); // Constructor is protected Lib(const Client* pClient); - UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2 - UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes - SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table // Number of unique MSAA sample rates (1/2/4/8) static const UINT_32 MaxNumMsaaRates = 4; + //# These fields exist in the GB_ADDR_CONFIG register; however, the HW does not care about them. + //# The HW acts as if the log2(pipes)==5 and log2(pi) == 8, always. + static const UINT_32 NumPipesLog2 = 5; + static const UINT_32 PipeInterleaveLog2 = 8; + // Number of equation entries in the table UINT_32 m_numEquations; @@ -444,4 +445,4 @@ private: } // V3 } // Addr -#endif \ No newline at end of file +#endif diff --git a/src/amd/addrlib/src/core/addrswizzler.cpp b/src/amd/addrlib/src/core/addrswizzler.cpp index b27851b9ec2..19e7d212bb1 100644 --- a/src/amd/addrlib/src/core/addrswizzler.cpp +++ b/src/amd/addrlib/src/core/addrswizzler.cpp @@ -2,7 +2,8 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -14,6 +15,7 @@ */ #include "addrswizzler.h" +#include "addrswizzlersimd.h" namespace Addr { @@ -98,6 +100,23 @@ void LutAddresser::InitSwizzleProps() m_sLutMask |= m_bit[i].s; } + // Derive the microblock size from the swizzle equation. 
+ UINT_32 xMbMask = 0; + UINT_32 yMbMask = 0; + UINT_32 zMbMask = 0; + for (UINT_32 i = 0; i < 8; i++) + { + xMbMask |= m_bit[i].x; + yMbMask |= m_bit[i].y; + zMbMask |= m_bit[i].z; + } + m_microBlockSize.width = xMbMask + 1; + m_microBlockSize.height = yMbMask + 1; + m_microBlockSize.depth = zMbMask + 1; + ADDR_ASSERT(IsPow2(m_microBlockSize.width)); + ADDR_ASSERT(IsPow2(m_microBlockSize.height)); + ADDR_ASSERT(IsPow2(m_microBlockSize.depth)); + // An expandX of 1 is a no-op m_maxExpandX = 1; if (m_sLutMask == 0) @@ -153,7 +172,7 @@ void LutAddresser::InitLuts() m_pYLut = &m_lutData[0]; ADDR_ASSERT(m_pYLut[0] == 0); } - + if (m_zLutMask != 0) { m_pZLut = &m_lutData[curOffset]; @@ -269,82 +288,33 @@ UINT_32 LutAddresser::EvalEquation( /** **************************************************************************************************** -* Copy2DSliceUnaligned +* CopyRowUnaligned * * @brief -* Copies an arbitrary 2D pixel region to or from a surface. +* Copies a single row to or from a surface. **************************************************************************************************** */ template -void Copy2DSliceUnaligned( - void* pImgBlockSliceStart, // Block corresponding to beginning of slice - void* pBuf, // Pointer to data starting from the copy origin. - size_t bufStrideY, // Stride of each row in pBuf - UINT_32 imageBlocksY, // Width of the image slice, in blocks. - ADDR_COORD2D origin, // Absolute origin, in elements - ADDR_EXTENT2D extent, // Size to copy, in elements - UINT_32 sliceXor, // Includes pipeBankXor and z XOR +void CopyRowUnaligned( + void* pRowImgBlockStart, // Pointer to the image block at x=0 + void* pBuf, // Pointer to data at x=0 + UINT_32 xStart, // x value to start at + UINT_32 xEnd, // x value to finish at (not inclusive) + UINT_32 rowXor, // Value to XOR in for each address (makes up PBX and y/z coords) const LutAddresser& addresser) { - UINT_32 xStart = origin.x; - UINT_32 xEnd = origin.x + extent.width; - + UINT_32 x = xStart; constexpr UINT_32 PixBytes = (1 << BPELog2); - // Apply a negative offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] - pBuf = VoidPtrDec(pBuf, xStart * PixBytes); - - // Do things one row at a time for unaligned regions. - for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++) + // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned + // regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though. + if (ExpandX > 1) { - UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY; - UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); - - UINT_32 x = xStart; - - // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned - // regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though. 
- if (ExpandX > 1) + // Unaligned left edge + for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++) { - // Unaligned left edge - for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++) - { - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); - if (ImgIsDest) - { - memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes); - } - else - { - memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); - } - } - // Aligned middle - for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX) - { - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); - if (ImgIsDest) - { - memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX); - } - else - { - memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX); - } - } - } - // Unaligned end (or the whole thing when ExpandX == 1) - for (; x < xEnd; x++) - { - // Get the index of the block within the slice - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - // Apply that index to get the base address of the current block. - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - // Grab the x-xor and XOR it all together, adding to get the final address + UINT_32 blk = (x >> addresser.GetBlockXBits()); + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); if (ImgIsDest) { @@ -355,8 +325,478 @@ void Copy2DSliceUnaligned( memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); } } + // Aligned middle + for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX) + { + UINT_32 blk = (x >> addresser.GetBlockXBits()); + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); + void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); + if (ImgIsDest) + { + memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX); + } + else + { + memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX); + } + } + } + // Unaligned end (or the whole thing when ExpandX == 1) + for (; x < xEnd; x++) + { + // Get the index of the block within the slice + UINT_32 blk = (x >> addresser.GetBlockXBits()); + // Apply that index to get the base address of the current block. + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); + // Grab the x-xor and XOR it all together, adding to get the final address + void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); + if (ImgIsDest) + { + memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes); + } + else + { + memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); + } + } +} - pBuf = VoidPtrInc(pBuf, bufStrideY); +/** +**************************************************************************************************** +* CopyImgUnaligned +* +* @brief +* Copies an arbitrary 3D pixel region to or from a surface. +**************************************************************************************************** +*/ +template +void CopyImgUnaligned( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. 
+ size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + constexpr UINT_32 PixBytes = (1 << BPELog2); + + // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] + // Keep the z offset. + pBuf = VoidPtrDec(pBuf, origin.x * PixBytes); + + void* pSliceBuf = pBuf; + // Do things one slice/row at a time for unaligned regions. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z++) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + void* pRowBuf = pSliceBuf; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++) + { + UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + UINT_64 rowOffset = ((zBlk + yBlk) << addresser.GetBlockBits()); + + void* pImgBlockRow = VoidPtrInc(pImgBlockStart, rowOffset); + + CopyRowUnaligned( + pImgBlockRow, + pRowBuf, + origin.x, + origin.x + extent.width, + rowXor, + addresser); + + pRowBuf = VoidPtrInc(pRowBuf, bufStrideY); + } + pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ); + } +} + + +/** +**************************************************************************************************** +* HandleUnalignedRegions +* +* @brief +* Does unaligned copies for any X/Y/Z edges that are not fully aligned, fixing up the +* copy region and pointer to point at the aligned region that remains. +**************************************************************************************************** +*/ +template +void HandleUnalignedRegions( + void* pImgBlockStart, // Block corresponding to beginning of image + void** ppBuf, // Pointer to pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D* pOrigin, // Absolute origin, in elements + ADDR_EXTENT3D* pExtent, // Size to copy, in elements + ADDR_EXTENT3D align, // Size to align on, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + constexpr bool ImgIsDest = true; + + // Go through the start/end of the x/y/z extents and copy the parts that aren't aligned. 
+ if (pOrigin->x != PowTwoAlign(pOrigin->x, align.width)) + { + UINT_32 xSize = Min(pOrigin->x + pExtent->width, PowTwoAlign(pOrigin->x, align.width)) - pOrigin->x; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { xSize, pExtent->height, pExtent->depth}, + pipeBankXor, + isInMipTail, + addresser); + pExtent->width -= xSize; + pOrigin->x += xSize; + *ppBuf = VoidPtrInc(*ppBuf, xSize << BPELog2); + } + if (pOrigin->y != PowTwoAlign(pOrigin->y, align.height)) + { + UINT_32 ySize = Min(pOrigin->y + pExtent->height, PowTwoAlign(pOrigin->y, align.height)) - pOrigin->y; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { pExtent->width, ySize, pExtent->depth}, + pipeBankXor, + isInMipTail, + addresser); + pExtent->height -= ySize; + pOrigin->y += ySize; + *ppBuf = VoidPtrInc(*ppBuf, ySize * bufStrideY); + } + if (pOrigin->z != PowTwoAlign(pOrigin->z, align.depth)) + { + UINT_32 zSize = Min(pOrigin->z + pExtent->depth, PowTwoAlign(pOrigin->z, align.depth)) - pOrigin->z; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { pExtent->width, pExtent->height, zSize }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->depth -= zSize; + pOrigin->z += zSize; + *ppBuf = VoidPtrInc(*ppBuf, zSize * bufStrideZ); + } + + // At this point the starts are aligned, so we can care about just size rather than origin+size. + if ((pExtent->width) != PowTwoAlignDown(pExtent->width, align.width)) + { + UINT_32 xAlignedSize = PowTwoAlignDown(pOrigin->x + pExtent->width, align.width) - pOrigin->x; + void* pBuf = VoidPtrInc(*ppBuf, xAlignedSize << BPELog2); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x + xAlignedSize, pOrigin->y, pOrigin->z}, + { pExtent->width - xAlignedSize, pExtent->height, pExtent->depth }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->width = xAlignedSize; + } + + if ((pExtent->height) != PowTwoAlignDown(pExtent->height, align.height)) + { + UINT_32 yAlignedSize = PowTwoAlignDown(pOrigin->y + pExtent->height, align.height) - pOrigin->y; + void* pBuf = VoidPtrInc(*ppBuf, yAlignedSize * bufStrideY); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x, pOrigin->y + yAlignedSize, pOrigin->z}, + { pExtent->width, pExtent->height - yAlignedSize, pExtent->depth }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->height = yAlignedSize; + } + + if ((pExtent->depth) != PowTwoAlignDown(pExtent->depth, align.depth)) + { + UINT_32 zAlignedSize = PowTwoAlignDown(pOrigin->z + pExtent->depth, align.depth) - pOrigin->z; + void* pBuf = VoidPtrInc(*ppBuf, zAlignedSize * bufStrideZ); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x, pOrigin->y, pOrigin->z + zAlignedSize }, + { pExtent->width, pExtent->height, pExtent->depth - zAlignedSize }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->depth = zAlignedSize; + } +} + +/** +**************************************************************************************************** +* CopyMemImgHybrid +* +* @brief +* Copies a 3D pixel region to a surface. Uses fast copies for fully covered microblocks. 
+**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgHybrid( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + // Handle unaligned edges in x/y/z and fixup the extents to match. + HandleUnalignedRegions( + pImgBlockStart, + &pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + &origin, + &extent, + MicroSw::MicroBlockExtent, + pipeBankXor, + isInMipTail, + addresser + ); + + // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] + // Keep the z offset. + pBuf = VoidPtrDec(pBuf, origin.x << MicroSw::BpeLog2); + + void* pSliceBuf = pBuf; + // Do things one slice/row at a time for unaligned regions. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += MicroSw::MicroBlockExtent.depth) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + void* pRowBuf = pSliceBuf; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += MicroSw::MicroBlockExtent.height) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + + for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += MicroSw::MicroBlockExtent.width) + { + UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk; + UINT_64 offset = (xBlk << addresser.GetBlockBits()); + offset ^= rowXor; + offset ^= addresser.GetAddressX(x); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + void* pPixBuf = VoidPtrInc(pRowBuf, x << MicroSw::BpeLog2); + + MicroSw::CopyMicroBlock( + pPix, + pPixBuf, + bufStrideY, + bufStrideZ + ); + } + pRowBuf = VoidPtrInc(pRowBuf, bufStrideY * MicroSw::MicroBlockExtent.height); + } + pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ * MicroSw::MicroBlockExtent.depth); + } +} + +/** +**************************************************************************************************** +* CopyMemImgMicroblocks +* +* @brief +* Copies the microblocks of a 3D pixel region to/from a surface. +**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgMicroblocks( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf, ignored. + size_t bufStrideZ, // Stride of each slice in pBuf, ignored. + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + // Pad out our dims to microblock boundaries. 
+ origin.x = PowTwoAlignDown(origin.x, addresser.GetMicroBlockX()); + origin.y = PowTwoAlignDown(origin.y, addresser.GetMicroBlockY()); + origin.z = PowTwoAlignDown(origin.z, addresser.GetMicroBlockZ()); + extent.width = PowTwoAlign(extent.width, addresser.GetMicroBlockX()); + extent.height = PowTwoAlign(extent.height, addresser.GetMicroBlockY()); + extent.depth = PowTwoAlign(extent.depth, addresser.GetMicroBlockZ()); + + // Calculate the address of the first pixel in each microblock (256B), then copy it. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetMicroBlockZ()) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetMicroBlockY()) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + + for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += addresser.GetMicroBlockX()) + { + UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk; + UINT_64 offset = (xBlk << addresser.GetBlockBits()); + offset ^= rowXor; + offset ^= addresser.GetAddressX(x); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + constexpr UINT_32 CopySize = 1 << 8; + +#if ADDR_HAS_AVX2 + if (NonTemporal && ImgIsDest) + { + StreamCopyToImgAligned(pPix, pBuf, CopySize); + } + else if (NonTemporal) + { + StreamCopyFromImgAligned(pBuf, pPix, CopySize); + } + else +#endif + if (ImgIsDest) + { + memcpy(pPix, pBuf, CopySize); + } + else + { + memcpy(pBuf, pPix, CopySize); + } + pBuf = VoidPtrInc(pBuf, CopySize); + } + } + } +} + +/** +**************************************************************************************************** +* CopyMemImgBlocks +* +* @brief +* Copies the blocks of a 3D pixel region to/from a surface. +**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgBlocks( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf, ignored. + size_t bufStrideZ, // Stride of each slice in pBuf, ignored. + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + if (isInMipTail) + { + return CopyMemImgMicroblocks( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + origin, + extent, + pipeBankXor, + isInMipTail, + addresser + ); + } + + // Pad out our dims to block boundaries. + origin.x = PowTwoAlignDown(origin.x, addresser.GetBlockX()); + origin.y = PowTwoAlignDown(origin.y, addresser.GetBlockY()); + origin.z = PowTwoAlignDown(origin.z, addresser.GetBlockZ()); + extent.width = PowTwoAlign(extent.width, addresser.GetBlockX()); + extent.height = PowTwoAlign(extent.height, addresser.GetBlockY()); + extent.depth = PowTwoAlign(extent.depth, addresser.GetBlockZ()); + + // Copy block by block. No complex swizzling here, everything is in (strided) typewriter order. 
+ for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetBlockZ()) + { + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetBlockY()) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 xBlkStart = (origin.x >> addresser.GetBlockXBits()) + yBlk; + UINT_32 numXBlk = extent.width >> addresser.GetBlockXBits(); + UINT_64 offset = (xBlkStart << addresser.GetBlockBits()); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + UINT_32 copySize = numXBlk << addresser.GetBlockBits(); + +#if ADDR_HAS_AVX2 + if (NonTemporal && ImgIsDest) + { + StreamCopyToImgAligned(pPix, pBuf, copySize); + } + else if (NonTemporal) + { + StreamCopyFromImgAligned(pBuf, pPix, copySize); + } + else +#endif + if (ImgIsDest) + { + memcpy(pPix, pBuf, copySize); + } + else + { + memcpy(pBuf, pPix, copySize); + } + pBuf = VoidPtrInc(pBuf, copySize); + } } } @@ -368,33 +808,130 @@ void Copy2DSliceUnaligned( * Determines and returns which copy function to use for copying to images **************************************************************************************************** */ -UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const +UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc( + ADDR_COPY_FLAGS flags + ) const { + UnalignedCopyMemImgFunc pfnRet = nullptr; + // This key encodes how the bottom 8 bits (256B) are formed, so we can match to the correct optimized + // swizzle function (they are all swizzle-agnostic beyond those 256B). + UINT_64 microSwKey = GetMicroSwKey(reinterpret_cast(&m_bit[0])); + + if (flags.blockMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgBlocks; + } + else +#endif + { + pfnRet = CopyMemImgBlocks; + } + } + + if ((pfnRet == nullptr) && flags.hybridMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgMicroblocks; + } + else +#endif + { + pfnRet = CopyMemImgMicroblocks; + } + } + + // If this is one of the known microswizzles and CPU support is present, use a hybrid copy that does + // SIMD swizzling for aligned regions and falls back for unaligned edges. 
+#if ADDR_HAS_AVX2 + static constexpr struct { + UINT_64 microSwKey; + UnalignedCopyMemImgFunc pfn; + } AvxFuncs[] = { + { GetMicroSwKey(MicroSw_2D_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_8BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_16BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_8BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_16BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_Z_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_D_1BPE_AVX2::MicroEq), CopyMemImgHybrid} + }; + if ((pfnRet == nullptr) && CpuSupportsAvx2()) + { + for (const auto& func : AvxFuncs) + { + if (func.microSwKey == microSwKey) + { + pfnRet = func.pfn; + break; + } + } + } +#endif // ADDR_HAS_AVX2 + +#if ADDR_HAS_NEON + static constexpr struct { + UINT_64 microSwKey; + UnalignedCopyMemImgFunc pfn; + } NeonFuncs[] = { + { GetMicroSwKey(MicroSw_2D_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_8BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_16BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_8BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_16BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_Z_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_D_1BPE_NEON::MicroEq), CopyMemImgHybrid} + }; + if ((pfnRet == nullptr) && CpuSupportsNeon()) + { + for (const auto& func : NeonFuncs) + { + if (func.microSwKey == microSwKey) + { + pfnRet = func.pfn; + break; + } + } + } +#endif // ADDR_HAS_NEON + // While these are all the same function, the codegen gets really bad if the size of each pixel // is not known at compile time. Hence, templates. 
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] = { // ExpandX = 1, 2, 4 - { Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE - { Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE - { Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE - { Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE - { Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE + { CopyImgUnaligned<0, 1, true>, CopyImgUnaligned<0, 2, true>, CopyImgUnaligned<0, 4, true> }, // 1BPE + { CopyImgUnaligned<1, 1, true>, CopyImgUnaligned<1, 2, true>, CopyImgUnaligned<1, 4, true> }, // 2BPE + { CopyImgUnaligned<2, 1, true>, CopyImgUnaligned<2, 2, true>, CopyImgUnaligned<2, 4, true> }, // 4BPE + { CopyImgUnaligned<3, 1, true>, CopyImgUnaligned<3, 2, true>, CopyImgUnaligned<3, 4, true> }, // 8BPE + { CopyImgUnaligned<4, 1, true>, CopyImgUnaligned<4, 2, true>, CopyImgUnaligned<4, 4, true> }, // 16BPE }; - UnalignedCopyMemImgFunc pfnRet = nullptr; - ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); - if (m_maxExpandX >= 4) + // Fallback functions + if (pfnRet == nullptr) { - pfnRet = Funcs[m_bpeLog2][2]; - } - else if (m_maxExpandX >= 2) - { - pfnRet = Funcs[m_bpeLog2][1]; - } - else - { - pfnRet = Funcs[m_bpeLog2][0]; + ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); + pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))]; } return pfnRet; } @@ -407,35 +944,139 @@ UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const * Determines and returns which copy function to use for copying from images **************************************************************************************************** */ -UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const +UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc( + ADDR_COPY_FLAGS flags + ) const { + UnalignedCopyMemImgFunc pfnRet = nullptr; + if (flags.blockMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgBlocks; + } + else +#endif + { + pfnRet = CopyMemImgBlocks; + } + } + + if ((pfnRet == nullptr) && flags.hybridMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgMicroblocks; + } + else +#endif + { + pfnRet = CopyMemImgMicroblocks; + } + } // While these are all the same function, the codegen gets really bad if the size of each pixel // is not known at compile time. Hence, templates. 
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] = { // ExpandX = 1, 2, 4 - { Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE - { Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE - { Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE - { Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE - { Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE + { CopyImgUnaligned<0, 1, false>, CopyImgUnaligned<0, 2, false>, CopyImgUnaligned<0, 4, false> }, // 1BPE + { CopyImgUnaligned<1, 1, false>, CopyImgUnaligned<1, 2, false>, CopyImgUnaligned<1, 4, false> }, // 2BPE + { CopyImgUnaligned<2, 1, false>, CopyImgUnaligned<2, 2, false>, CopyImgUnaligned<2, 4, false> }, // 4BPE + { CopyImgUnaligned<3, 1, false>, CopyImgUnaligned<3, 2, false>, CopyImgUnaligned<3, 4, false> }, // 8BPE + { CopyImgUnaligned<4, 1, false>, CopyImgUnaligned<4, 2, false>, CopyImgUnaligned<4, 4, false> }, // 16BPE }; - UnalignedCopyMemImgFunc pfnRet = nullptr; ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); - if (m_maxExpandX >= 4) + if (pfnRet == nullptr) { - pfnRet = Funcs[m_bpeLog2][2]; - } - else if (m_maxExpandX >= 2) - { - pfnRet = Funcs[m_bpeLog2][1]; - } - else - { - pfnRet = Funcs[m_bpeLog2][0]; + pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))]; } return pfnRet; } +/** +**************************************************************************************************** +* LutAddresser::DoCopyImgMemPreFlushes +* +* @brief +* Does any flushes required for nontemporal SIMD instructions to access the image memory. +**************************************************************************************************** +*/ +void LutAddresser::DoCopyImgMemPreFlushes( + ADDR_COPY_FLAGS flags + ) const +{ +#if ADDR_HAS_AVX2 + if ((flags.blockMemcpy || flags.hybridMemcpy) && CpuSupportsAvx2()) + { + // Loads are weakly ordered, and we need to ensure they start after the previous copy + NonTemporalLoadStoreFence(); + } +#endif +} + +/** +**************************************************************************************************** +* LutAddresser::DoCopyMemImgPostFlushes +* +* @brief +* Does any flushes required for nontemporal SIMD instructions to access the image memory. +**************************************************************************************************** +*/ +void LutAddresser::DoCopyMemImgPostFlushes( + ADDR_COPY_FLAGS flags + ) const +{ +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + // Stores are weakly ordered, and we need to ensure they finish before the next submission + // or copy. 
+ NonTemporalStoreFence(); + } +#endif +} + + +#if __cplusplus < 201703L +// Constexpr arrays need an additional definition at namespace scope until c++17 +#if ADDR_HAS_AVX2 +constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_D_1BPE_AVX2::MicroBlockExtent; +#endif +#if ADDR_HAS_NEON +constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_D_1BPE_NEON::MicroBlockExtent; +#endif + +#endif + } diff --git a/src/amd/addrlib/src/core/addrswizzler.h b/src/amd/addrlib/src/core/addrswizzler.h index feace2761c1..7f615711b84 100644 --- a/src/amd/addrlib/src/core/addrswizzler.h +++ b/src/amd/addrlib/src/core/addrswizzler.h @@ -1,7 +1,8 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ /** @@ -26,10 +27,13 @@ typedef void (*UnalignedCopyMemImgFunc)( void* pImgBlockSliceStart, // Block corresponding to beginning of slice void* pBuf, // Pointer to data starting from the copy origin. size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf UINT_32 imageBlocksY, // Width of the image slice, in blocks. - ADDR_COORD2D origin, // Absolute origin, in elements - ADDR_EXTENT2D extent, // Size to copy, in elements - UINT_32 sliceXor, // Includes pipeBankXor and z XOR + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. 
+ ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to XOR into the address + BOOL_32 isInMipTail, // True if this is in the mip tail. const LutAddresser& addresser); // This class calculates and holds up to four lookup tables (x/y/z/s) which can be used to cheaply calculate the @@ -60,10 +64,21 @@ public: // Get the block size UINT_32 GetBlockBits() const { return m_blockBits; } + UINT_32 GetBlockX() const { return m_blockSize.width; } + UINT_32 GetBlockY() const { return m_blockSize.height; } + UINT_32 GetBlockZ() const { return m_blockSize.depth; } UINT_32 GetBlockXBits() const { return Log2(m_blockSize.width); } UINT_32 GetBlockYBits() const { return Log2(m_blockSize.height); } UINT_32 GetBlockZBits() const { return Log2(m_blockSize.depth); } + // Get the microblock size + UINT_32 GetMicroBlockX() const { return m_microBlockSize.width; } + UINT_32 GetMicroBlockY() const { return m_microBlockSize.height; } + UINT_32 GetMicroBlockZ() const { return m_microBlockSize.depth; } + + // Get other image props + UINT_32 GetBpeLog2() const { return m_bpeLog2; } + // "Fast single channel" functions to get the part that each channel contributes to be XORd together. UINT_32 GetAddressX(UINT_32 x) const { return m_pXLut[x & m_xLutMask];} UINT_32 GetAddressY(UINT_32 y) const { return m_pYLut[y & m_yLutMask];} @@ -71,8 +86,11 @@ public: UINT_32 GetAddressS(UINT_32 s) const { return m_pSLut[s & m_sLutMask];} // Get a function that can copy a single 2D slice of an image with this swizzle. - UnalignedCopyMemImgFunc GetCopyMemImgFunc() const; - UnalignedCopyMemImgFunc GetCopyImgMemFunc() const; + UnalignedCopyMemImgFunc GetCopyMemImgFunc(ADDR_COPY_FLAGS flags) const; + UnalignedCopyMemImgFunc GetCopyImgMemFunc(ADDR_COPY_FLAGS flags) const; + + void DoCopyMemImgPostFlushes(ADDR_COPY_FLAGS flags) const; + void DoCopyImgMemPreFlushes(ADDR_COPY_FLAGS flags) const; private: // Calculate general properties of the swizzle equations void InitSwizzleProps(); @@ -99,6 +117,9 @@ private: // The block size ADDR_EXTENT3D m_blockSize; + + // The microblock size + ADDR_EXTENT3D m_microBlockSize; // Number of 'x' bits at the bottom of the equation. Must be a pow2 and at least 1. // This will be used as a simple optimization to batch together operations on adjacent x pixels. diff --git a/src/amd/addrlib/src/core/addrswizzlersimd.h b/src/amd/addrlib/src/core/addrswizzlersimd.h new file mode 100644 index 00000000000..fd53e4080a3 --- /dev/null +++ b/src/amd/addrlib/src/core/addrswizzlersimd.h @@ -0,0 +1,2031 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ +/** +**************************************************************************************************** +* @file addrswizzlersimd.h +* @brief Contains CPU/swizzle-specific code for efficient CPU swizzling. 
+****************************************************************************************************
+*/
+
+#ifndef __ADDR_SWIZZLER_SIMD_H__
+#define __ADDR_SWIZZLER_SIMD_H__
+
+#include "addrswizzler.h"
+#include "addrcommon.h"
+
+#if !ADDR_ALLOW_SIMD
+// Disabled
+#define ADDR_HAS_AVX2 0
+#define AVX2_FUNC
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+// x86 visual studio builds
+#define ADDR_HAS_AVX2 1
+#define AVX2_FUNC
+#elif defined(__x86_64__) || defined(__i386__)
+// x86 GCC/Clang builds
+#define ADDR_HAS_AVX2 1
+#define AVX2_FUNC [[gnu::target("avx2")]]
+#else
+// Unknown
+#define ADDR_HAS_AVX2 0
+#define AVX2_FUNC
+#endif
+
+#if !ADDR_ALLOW_SIMD
+// Disabled
+#define ADDR_HAS_NEON 0
+#define NEON_FUNC
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
+// arm visual studio builds
+#define ADDR_HAS_NEON 1
+#define NEON_FUNC
+#elif (defined(__linux__) || defined(_WIN32)) && (defined(__aarch64__) || (defined(__arm__) && defined(__ARM_FP)))
+// arm GCC/Clang builds on windows or linux
+#define ADDR_HAS_NEON 1
+#define NEON_FUNC
+#else
+// Unknown
+#define ADDR_HAS_NEON 0
+#define NEON_FUNC
+#endif
+
+#if ADDR_HAS_AVX2
+#if _MSC_VER
+#include <windows.h>
+#endif
+#include <immintrin.h>
+
+// Certain compiler versions lack this intrinsic.
+#if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8) || (_MSC_VER && !defined(_mm256_set_m128i))
+#define _mm256_set_m128i(hi, lo) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#endif
+#endif
+
+#if ADDR_HAS_NEON
+#if _WIN32
+#include <arm_neon.h>
+#else
+#include <arm_neon.h>
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+#endif
+
+namespace Addr
+{
+
+#if ADDR_HAS_AVX2
+static inline bool CpuSupportsAvx2() {
+    // Use compiler builtins to check for support
+#if _MSC_VER
+    return IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE);
+#elif defined(__GNUC__)
+    return __builtin_cpu_supports("avx2");
+#else
+#error "What platform is this?"
+#endif
+}
+#endif
+
+#if ADDR_HAS_NEON
+static inline bool CpuSupportsNeon() {
+    // ARM can't check this without OS help. Use OS knowledge and helpers to check for support.
+#if _WIN32
+    return true; // Mandatory for WoA
+#elif defined(__linux__) && defined(__aarch64__)
+    return ((getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0);
+#elif defined(__linux__) && defined(__arm__)
+    return ((getauxval(AT_HWCAP) & HWCAP_NEON) != 0);
+#else
+#error "What platform is this?"
+#endif
+}
+#endif
+
+constexpr UINT_64 GetMicroSwKey(const UINT_64* pEq, bool isPlanarMsaa = false)
+{
+    UINT_64 out = 0;
+
+    // Microswizzles never have more than 1 xor, so just use the log2 of each bit (6 bits each = 48 bits)
+    for (UINT_32 i = 0; i < 8; i++)
+    {
+        if (pEq[i] != 0)
+        {
+            out |= (static_cast<UINT_64>(ConstexprLog2(pEq[i]) + 1) << (i * 6));
+        }
+    }
+
+    // Bits 48+ can be other things. The big one is if this is actually MSAA but the sample bits are in the
+    // high part of the equation.
+    if (isPlanarMsaa)
+    {
+        out |= (static_cast<UINT_64>(1) << 48);
+    }
+
+    return out;
+}
+
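To make the packing above concrete, here is a small self-contained example that reproduces the same key scheme: each non-zero equation entry contributes log2(bit) + 1 in its own 6-bit field, so two micro-swizzles compare equal only if every low address bit is fed by the same source bit. The X*/Y* constants and MakeMicroSwKey below are simplified placeholders (single-bit masks), not addrlib's real channel encodings or its ConstexprLog2 helper.

```cpp
// Standalone sketch of the micro-swizzle key packing used by GetMicroSwKey().
// X0..X3 / Y0..Y3 are placeholder single-bit masks, not addrlib's definitions. Requires C++14.
#include <cstdint>
#include <cstdio>

static constexpr uint64_t X0 = 1ull << 0, X1 = 1ull << 1, X2 = 1ull << 2,  X3 = 1ull << 3;
static constexpr uint64_t Y0 = 1ull << 8, Y1 = 1ull << 9, Y2 = 1ull << 10, Y3 = 1ull << 11;

constexpr uint32_t Log2U64(uint64_t v) { return (v <= 1) ? 0u : 1u + Log2U64(v >> 1); }

// Same packing as GetMicroSwKey(): 6 bits per low address bit, value = log2(source bit) + 1.
constexpr uint64_t MakeMicroSwKey(const uint64_t (&eq)[8])
{
    uint64_t key = 0;
    for (uint32_t i = 0; i < 8; i++)
    {
        if (eq[i] != 0)
        {
            key |= static_cast<uint64_t>(Log2U64(eq[i]) + 1) << (i * 6);
        }
    }
    return key;
}

int main()
{
    // Low 8 address bits of a hypothetical 1BPE 2D micro-block.
    constexpr uint64_t eq[8] = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 };
    printf("microSwKey = 0x%012llx\n", (unsigned long long)MakeMicroSwKey(eq));
    return 0;
}
```

In the library itself this key is computed once from the swizzle equation in GetCopyMemImgFunc and compared against the MicroEq keys of the MicroSw_* classes defined later in this header.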
+#if ADDR_HAS_AVX2
+// Ensures all non-temporal/stream stores have completed.
+AVX2_FUNC static inline void NonTemporalStoreFence()
+{
+    _mm_sfence();
+}
+
+// Ensures all non-temporal/stream loads and stores have completed.
+AVX2_FUNC static inline void NonTemporalLoadStoreFence()
+{
+    _mm_mfence();
+}
+
+AVX2_FUNC static inline void StreamCopyToImgAligned(
+    void*       pImg, // Memory to write to, must be 256B aligned.
+    const void* pBuf, // Memory to read from, can be unaligned.
+    size_t      size) // Bytes to copy, must be 256B aligned.
+{
+    __m256i*       pAlignedOut  = reinterpret_cast<__m256i*>(pImg);
+    const __m256i* pUnalignedIn = reinterpret_cast<const __m256i*>(pBuf);
+    ADDR_ASSERT(PowTwoAlign(uint64_t(size), 256ULL) == uint64_t(size));
+    while (size > 0)
+    {
+        // Use streaming stores to optimize memory behavior-- this requires aligned memory.
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+
+        size -= 256;
+    }
+}
+
+AVX2_FUNC static inline void StreamCopyFromImgAligned(
+    void*       pBuf, // Memory to write to, can be unaligned.
+    const void* pImg, // Memory to read from, must be 256B aligned.
+    size_t      size) // Bytes to copy, must be 256B aligned.
+{
+    __m256i*       pUnalignedOut = reinterpret_cast<__m256i*>(pBuf);
+    const __m256i* pAlignedIn    = reinterpret_cast<const __m256i*>(pImg);
+    ADDR_ASSERT(PowTwoAlign(uint64_t(size), 256ULL) == uint64_t(size));
+    while (size > 0)
+    {
+        // Use streaming loads to optimize memory behavior-- this requires aligned memory.
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+
+        size -= 256;
+    }
+}
+
+class MicroSw_2D_1BPE_AVX2
+{
+    MicroSw_2D_1BPE_AVX2() = delete;
+public:
+    static constexpr UINT_64       MicroEq[8]       = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1};
+    static constexpr UINT_32       BpeLog2          = 0;
+    static constexpr UINT_32       ExpandX          = 4;
+
+    AVX2_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+        // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b)
+        // Yes, that means double the load instructions
+        // Each reg becomes: [ X3 X2 X1 X0 ]
+        __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0));
+        __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1));
+        __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2));
+        __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3));
+        __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4));
+        __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5));
+        __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6));
+        __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7));
+        __m128i y8 = _mm_loadu_si128((const 
__m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // First, concat the SSE regs so they end up in pairs that will unpack well + // Each reg becomes: [ Y1 X3 X2 X1 X0 ] + __m256i m1_0 = _mm256_set_m128i(y2, y0); + __m256i m1_1 = _mm256_set_m128i(y3, y1); + __m256i m1_2 = _mm256_set_m128i(y6, y4); + __m256i m1_3 = _mm256_set_m128i(y7, y5); + __m256i m1_4 = _mm256_set_m128i(y10, y8); + __m256i m1_5 = _mm256_set_m128i(y11, y9); + __m256i m1_6 = _mm256_set_m128i(y14, y12); + __m256i m1_7 = _mm256_set_m128i(y15, y13); + + // Unpack to handle the rest of the swizzling within each reg + // Each reg becomes: [ Y1 X2 Y0 X1 X0 ] + __m256i m2_0 = _mm256_unpacklo_epi32(m1_0, m1_1); + __m256i m2_1 = _mm256_unpackhi_epi32(m1_0, m1_1); + __m256i m2_2 = _mm256_unpacklo_epi32(m1_2, m1_3); + __m256i m2_3 = _mm256_unpackhi_epi32(m1_2, m1_3); + __m256i m2_4 = _mm256_unpacklo_epi32(m1_4, m1_5); + __m256i m2_5 = _mm256_unpackhi_epi32(m1_4, m1_5); + __m256i m2_6 = _mm256_unpacklo_epi32(m1_6, m1_7); + __m256i m2_7 = _mm256_unpackhi_epi32(m1_6, m1_7); + + // Move each reg around to handle high bit swizzling + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, m2_0); + _mm256_stream_si256(pAlignedOut++, m2_2); + _mm256_stream_si256(pAlignedOut++, m2_1); + _mm256_stream_si256(pAlignedOut++, m2_3); + _mm256_stream_si256(pAlignedOut++, m2_4); + _mm256_stream_si256(pAlignedOut++, m2_6); + _mm256_stream_si256(pAlignedOut++, m2_5); + _mm256_stream_si256(pAlignedOut++, m2_7); + } +}; + + +class MicroSw_2D_2BPE_AVX2 +{ + MicroSw_2D_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Y0, X1, Y1, X2, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // Each reg becomes: [ X3 X2 X1 X0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = 
_mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X2 X1 Y0 X0 0 ] + __m256i m0_0 = _mm256_unpacklo_epi32(y0, y1); + __m256i m0_1 = _mm256_unpackhi_epi32(y0, y1); + __m256i m0_2 = _mm256_unpacklo_epi32(y2, y3); + __m256i m0_3 = _mm256_unpackhi_epi32(y2, y3); + __m256i m0_4 = _mm256_unpacklo_epi32(y4, y5); + __m256i m0_5 = _mm256_unpackhi_epi32(y4, y5); + __m256i m0_6 = _mm256_unpacklo_epi32(y6, y7); + __m256i m0_7 = _mm256_unpackhi_epi32(y6, y7); + + // Then use a cross-lane dual permute to do a 128b interleave across y1 + // Each reg becomes: [ Y1 X1 Y0 X0 0 ] + __m256i m1_0 = _mm256_permute2x128_si256(m0_0, m0_2, 0x20); + __m256i m1_1 = _mm256_permute2x128_si256(m0_1, m0_3, 0x20); + __m256i m1_2 = _mm256_permute2x128_si256(m0_4, m0_6, 0x20); + __m256i m1_3 = _mm256_permute2x128_si256(m0_5, m0_7, 0x20); + __m256i m1_4 = _mm256_permute2x128_si256(m0_0, m0_2, 0x31); + __m256i m1_5 = _mm256_permute2x128_si256(m0_1, m0_3, 0x31); + __m256i m1_6 = _mm256_permute2x128_si256(m0_4, m0_6, 0x31); + __m256i m1_7 = _mm256_permute2x128_si256(m0_5, m0_7, 0x31); + + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, m1_0); + _mm256_stream_si256(pAlignedOut++, m1_1); + _mm256_stream_si256(pAlignedOut++, m1_2); + _mm256_stream_si256(pAlignedOut++, m1_3); + _mm256_stream_si256(pAlignedOut++, m1_4); + _mm256_stream_si256(pAlignedOut++, m1_5); + _mm256_stream_si256(pAlignedOut++, m1_6); + _mm256_stream_si256(pAlignedOut++, m1_7); + } +}; + + +class MicroSw_2D_4BPE_AVX2 +{ + MicroSw_2D_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = { 8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // Each reg becomes: [ X2 X1 X0 0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 3 are already right: (X1, Y0, *, *, *) + // Start: y0/y1: [ 0 1 2 3 ] [ 4 5 6 7 ] (QWORDS) + // Desired: [ 0 4 1 5 ] [ 2 6 3 7 ] + + // Do a permute to reorder each register to have all values in the correct 128b lanes. 
+ // Swap 2/3 -> mask 0b11_01_10_00 -> 0xD8 + // Result: [ 0 2 1 3 ] [ 4 6 5 7 ] + // Each reg becomes: [ X1 X2 X0 0 0 ] + __m256i perm0 = _mm256_permute4x64_epi64(y0, 0xD8); + __m256i perm1 = _mm256_permute4x64_epi64(y1, 0xD8); + __m256i perm2 = _mm256_permute4x64_epi64(y2, 0xD8); + __m256i perm3 = _mm256_permute4x64_epi64(y3, 0xD8); + __m256i perm4 = _mm256_permute4x64_epi64(y4, 0xD8); + __m256i perm5 = _mm256_permute4x64_epi64(y5, 0xD8); + __m256i perm6 = _mm256_permute4x64_epi64(y6, 0xD8); + __m256i perm7 = _mm256_permute4x64_epi64(y7, 0xD8); + + // Then use unpack intrinsics to interleave two regs (within those lanes), which leaves it in the final place + // Result: [ 0 4 1 5 ] [ 2 6 3 7] + // Each reg becomes: [ X1 Y0 X0 0 0 ] + __m256i unpack0 = _mm256_unpacklo_epi64(perm0, perm1); + __m256i unpack1 = _mm256_unpackhi_epi64(perm0, perm1); + __m256i unpack2 = _mm256_unpacklo_epi64(perm2, perm3); + __m256i unpack3 = _mm256_unpackhi_epi64(perm2, perm3); + __m256i unpack4 = _mm256_unpacklo_epi64(perm4, perm5); + __m256i unpack5 = _mm256_unpackhi_epi64(perm4, perm5); + __m256i unpack6 = _mm256_unpacklo_epi64(perm6, perm7); + __m256i unpack7 = _mm256_unpackhi_epi64(perm6, perm7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; + +class MicroSw_2D_8BPE_AVX2 +{ + MicroSw_2D_8BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, X1, X2, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 1}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // Each reg becomes: [ X1 X0 0 0 0 ] + __m256i y0a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 0))); + __m256i y0b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 32))); + __m256i y1a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 0))); + __m256i y1b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 32))); + __m256i y2a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 0))); + __m256i y2b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 32))); + __m256i y3a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 0))); + __m256i y3b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 32))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0a/y1a: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ 
Y0 X0 0 0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0a, y1a, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0a, y1a, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2a, y3a, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2a, y3a, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y0b, y1b, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y0b, y1b, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y2b, y3b, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y2b, y3b, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_2D_16BPE_AVX2 +{ + MicroSw_2D_16BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Y0, X1, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 1}; + static constexpr UINT_32 BpeLog2 = 4; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (4 col * 128b = 256bx2) + __m256i y0a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 0))); + __m256i y0b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 32))); + __m256i y1a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 0))); + __m256i y1b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 32))); + __m256i y2a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 0))); + __m256i y2b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 32))); + __m256i y3a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 0))); + __m256i y3b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 32))); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, y0a); + _mm256_stream_si256(pAlignedOut++, y1a); + _mm256_stream_si256(pAlignedOut++, y0b); + _mm256_stream_si256(pAlignedOut++, y1b); + _mm256_stream_si256(pAlignedOut++, y2a); + _mm256_stream_si256(pAlignedOut++, y3a); + _mm256_stream_si256(pAlignedOut++, y2b); + _mm256_stream_si256(pAlignedOut++, y3b); + } +}; + + +class MicroSw_3D_1BPE_AVX2 +{ + MicroSw_3D_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Z0, Y0, Y1, Z1, X2, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 8}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (8 col * 8b = 64b) + // We'll do one 64x4 gather-load for each slice. + + // Pre-compute the y offsets + __m256i yOffsets = _mm256_set_epi64x((3 * bufStrideY), (2 * bufStrideY), (1 * bufStrideY), 0); + + // Then gather, incrementing the 'base' address for each slice. + // Each reg becomes: [ Y1 Y0 X2 X1 X0 ] + __m256i z0 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0))), yOffsets, 1); + __m256i z1 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1))), yOffsets, 1); + __m256i z2 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2))), yOffsets, 1); + __m256i z3 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3))), yOffsets, 1); + __m256i z4 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4))), yOffsets, 1); + __m256i z5 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5))), yOffsets, 1); + __m256i z6 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6))), yOffsets, 1); + __m256i z7 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7))), yOffsets, 1); + + // First swap x2 and y0 bits, so the following unpack ends up with x2 across registers + // Each reg becomes: [ Y1 X2 Y0 X1 X0 ] + __m256i shuf0 = _mm256_shuffle_epi32(z0, 0b11011000); + __m256i shuf1 = _mm256_shuffle_epi32(z1, 0b11011000); + __m256i shuf2 = _mm256_shuffle_epi32(z2, 0b11011000); + __m256i shuf3 = _mm256_shuffle_epi32(z3, 0b11011000); + __m256i shuf4 = _mm256_shuffle_epi32(z4, 0b11011000); + __m256i shuf5 = _mm256_shuffle_epi32(z5, 0b11011000); + __m256i shuf6 = _mm256_shuffle_epi32(z6, 0b11011000); + __m256i shuf7 = _mm256_shuffle_epi32(z7, 0b11011000); + + // Unpack to 32-bit interleave by z0. + // Each reg becomes: [ Y1 Y0 Z0 X1 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi32(shuf0, shuf1); + __m256i unpack1 = _mm256_unpackhi_epi32(shuf0, shuf1); + __m256i unpack2 = _mm256_unpacklo_epi32(shuf2, shuf3); + __m256i unpack3 = _mm256_unpackhi_epi32(shuf2, shuf3); + __m256i unpack4 = _mm256_unpacklo_epi32(shuf4, shuf5); + __m256i unpack5 = _mm256_unpackhi_epi32(shuf4, shuf5); + __m256i unpack6 = _mm256_unpacklo_epi32(shuf6, shuf7); + __m256i unpack7 = _mm256_unpackhi_epi32(shuf6, shuf7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; + +class MicroSw_3D_2BPE_AVX2 +{ + MicroSw_3D_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Z0, Y0, X1, Z1, Y1, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 8}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // We'll do one 64x4 gather-load for each slice. + + // Pre-compute the y offsets, doing a pre-swizzle between y0 and y1 by changing the order of offsets. + // The pre-swizzle is done so that y1 gets separated across different registers in the unpack below. + __m256i yOffsets = _mm256_set_epi64x((3 * bufStrideY), (1 * bufStrideY), (2 * bufStrideY), 0); + + // Then gather, incrementing the 'base' address for each slice. + // Each reg becomes: [ Y0 Y1 X1 X0 0 ] + __m256i z0 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0))), yOffsets, 1); + __m256i z1 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1))), yOffsets, 1); + __m256i z2 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2))), yOffsets, 1); + __m256i z3 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3))), yOffsets, 1); + __m256i z4 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4))), yOffsets, 1); + __m256i z5 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5))), yOffsets, 1); + __m256i z6 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6))), yOffsets, 1); + __m256i z7 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7))), yOffsets, 1); + + // Unpack to 32-bit interleave by z0. 
+ // Each reg becomes: [ Y0 X1 Z0 X0 0 ] + __m256i unpack0 = _mm256_unpacklo_epi32(z0, z1); + __m256i unpack1 = _mm256_unpackhi_epi32(z0, z1); + __m256i unpack2 = _mm256_unpacklo_epi32(z2, z3); + __m256i unpack3 = _mm256_unpackhi_epi32(z2, z3); + __m256i unpack4 = _mm256_unpacklo_epi32(z4, z5); + __m256i unpack5 = _mm256_unpackhi_epi32(z4, z5); + __m256i unpack6 = _mm256_unpacklo_epi32(z6, z7); + __m256i unpack7 = _mm256_unpackhi_epi32(z6, z7); + + // Then do a cross-lane permute to swap y0 and x1 + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X1 Y0 Z0 X0 0 ] + __m256i permute0 = _mm256_permute4x64_epi64(unpack0, 0b11011000); + __m256i permute1 = _mm256_permute4x64_epi64(unpack1, 0b11011000); + __m256i permute2 = _mm256_permute4x64_epi64(unpack2, 0b11011000); + __m256i permute3 = _mm256_permute4x64_epi64(unpack3, 0b11011000); + __m256i permute4 = _mm256_permute4x64_epi64(unpack4, 0b11011000); + __m256i permute5 = _mm256_permute4x64_epi64(unpack5, 0b11011000); + __m256i permute6 = _mm256_permute4x64_epi64(unpack6, 0b11011000); + __m256i permute7 = _mm256_permute4x64_epi64(unpack7, 0b11011000); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, permute0); + _mm256_stream_si256(pAlignedOut++, permute2); + _mm256_stream_si256(pAlignedOut++, permute1); + _mm256_stream_si256(pAlignedOut++, permute3); + _mm256_stream_si256(pAlignedOut++, permute4); + _mm256_stream_si256(pAlignedOut++, permute6); + _mm256_stream_si256(pAlignedOut++, permute5); + _mm256_stream_si256(pAlignedOut++, permute7); + } +}; + + +class MicroSw_3D_4BPE_AVX2 +{ + MicroSw_3D_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Z0, Y1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 4}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 4 rows * (4 col * 32b = 128b) + // Each reg becomes: [ X1 X0 0 0 ] + __m128i z0y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0))); + __m128i z0y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1))); + __m128i z0y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))); + __m128i z0y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))); + __m128i z1y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0))); + __m128i z1y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1))); + __m128i z1y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))); + __m128i z1y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))); + __m128i z2y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0))); + __m128i z2y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1))); + __m128i z2y2 = _mm_loadu_si128((const 
__m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))); + __m128i z2y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))); + __m128i z3y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0))); + __m128i z3y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1))); + __m128i z3y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))); + __m128i z3y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X1 X0 0 0 ] + __m256i concat0 = _mm256_set_m128i(z0y1, z0y0); + __m256i concat1 = _mm256_set_m128i(z0y3, z0y2); + __m256i concat2 = _mm256_set_m128i(z1y1, z1y0); + __m256i concat3 = _mm256_set_m128i(z1y3, z1y2); + __m256i concat4 = _mm256_set_m128i(z2y1, z2y0); + __m256i concat5 = _mm256_set_m128i(z2y3, z2y2); + __m256i concat6 = _mm256_set_m128i(z3y1, z3y0); + __m256i concat7 = _mm256_set_m128i(z3y3, z3y2); + + // Then do a cross-lane permute to swap y0 and x1 + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X1 Y0 X0 0 0 ] + __m256i perm0 = _mm256_permute4x64_epi64(concat0, 0b11011000); + __m256i perm1 = _mm256_permute4x64_epi64(concat1, 0b11011000); + __m256i perm2 = _mm256_permute4x64_epi64(concat2, 0b11011000); + __m256i perm3 = _mm256_permute4x64_epi64(concat3, 0b11011000); + __m256i perm4 = _mm256_permute4x64_epi64(concat4, 0b11011000); + __m256i perm5 = _mm256_permute4x64_epi64(concat5, 0b11011000); + __m256i perm6 = _mm256_permute4x64_epi64(concat6, 0b11011000); + __m256i perm7 = _mm256_permute4x64_epi64(concat7, 0b11011000); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_3D_8BPE_AVX2 +{ + MicroSw_3D_8BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, Z0, X1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 2, 4}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b) + // Each reg becomes: [ X1 X0 0 0 0 ] + __m256i z0y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + __m256i z0y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + __m256i z1y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + __m256i z1y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + __m256i z2y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + __m256i z2y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + __m256i z3y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + __m256i z3y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // Use 128-bit permutes to swap the y0 bit (diff registers) with the x0 bit (128b boundary) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X0 0 0 0 ] + __m256i z0x0 = _mm256_permute2x128_si256(z0y0, z0y1, 0x20); + __m256i z0x1 = _mm256_permute2x128_si256(z0y0, z0y1, 0x31); + __m256i z1x0 = _mm256_permute2x128_si256(z1y0, z1y1, 0x20); + __m256i z1x1 = _mm256_permute2x128_si256(z1y0, z1y1, 0x31); + __m256i z2x0 = _mm256_permute2x128_si256(z2y0, z2y1, 0x20); + __m256i z2x1 = _mm256_permute2x128_si256(z2y0, z2y1, 0x31); + __m256i z3x0 = _mm256_permute2x128_si256(z3y0, z3y1, 0x20); + __m256i z3x1 = _mm256_permute2x128_si256(z3y0, z3y1, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, z0x0); + _mm256_stream_si256(pAlignedOut++, z1x0); + _mm256_stream_si256(pAlignedOut++, z0x1); + _mm256_stream_si256(pAlignedOut++, z1x1); + _mm256_stream_si256(pAlignedOut++, z2x0); + _mm256_stream_si256(pAlignedOut++, z3x0); + _mm256_stream_si256(pAlignedOut++, z2x1); + _mm256_stream_si256(pAlignedOut++, z3x1); + } +}; + + +class MicroSw_3D_16BPE_AVX2 +{ + MicroSw_3D_16BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Z0, Y0, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {2, 2, 4}; + static constexpr UINT_32 BpeLog2 = 4; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (2 col * 128b = 256b) + __m256i z0y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + __m256i z0y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + __m256i z1y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + __m256i z1y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + __m256i z2y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + __m256i z2y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + __m256i z3y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + __m256i z3y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // The whole swizzle can be handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, z0y0); + _mm256_stream_si256(pAlignedOut++, z1y0); + _mm256_stream_si256(pAlignedOut++, z0y1); + _mm256_stream_si256(pAlignedOut++, z1y1); + _mm256_stream_si256(pAlignedOut++, z2y0); + _mm256_stream_si256(pAlignedOut++, z3y0); + _mm256_stream_si256(pAlignedOut++, z2y1); + _mm256_stream_si256(pAlignedOut++, z3y1); + } +}; + + +class MicroSw_R_1BPE_AVX2 +{ + MicroSw_R_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, X3, Y0, Y1, Y2, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Yes, that means double the load instructions + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y1, y0); + __m256i concat1 = _mm256_set_m128i(y3, y2); + __m256i concat2 = _mm256_set_m128i(y5, y4); + __m256i concat3 = _mm256_set_m128i(y7, y6); + __m256i concat4 = _mm256_set_m128i(y9, y8); + __m256i concat5 = _mm256_set_m128i(y11, y10); + __m256i concat6 = _mm256_set_m128i(y13, y12); + __m256i concat7 = _mm256_set_m128i(y15, y14); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, concat0); + _mm256_stream_si256(pAlignedOut++, concat1); + _mm256_stream_si256(pAlignedOut++, concat2); + _mm256_stream_si256(pAlignedOut++, concat3); + _mm256_stream_si256(pAlignedOut++, concat4); + _mm256_stream_si256(pAlignedOut++, concat5); + _mm256_stream_si256(pAlignedOut++, concat6); + _mm256_stream_si256(pAlignedOut++, concat7); + } +}; + +class MicroSw_R_2BPE_AVX2 +{ + MicroSw_R_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, X1, X2, Y0, Y1, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // Each reg becomes: [ X3 X2 X1 X0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X2 X1 X0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0, y1, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0, y1, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2, y3, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2, y3, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y4, y5, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y4, y5, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y6, y7, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y6, y7, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_R_4BPE_AVX2 +{ + MicroSw_R_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, X1, Y0, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // Each reg becomes: [ X2 X1 X0 0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X1 X0 0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0, y1, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0, y1, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2, y3, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2, y3, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y4, y5, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y4, y5, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y6, y7, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y6, y7, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + + +class MicroSw_Z_1BPE_AVX2 +{ + MicroSw_Z_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, Y0, X1, Y1, X2, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*15)); + + // First combine two 128-bit values (across y1) + // Each reg becomes: [ Y1 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y2, y0); + __m256i concat1 = _mm256_set_m128i(y3, y1); + __m256i concat2 = _mm256_set_m128i(y6, y4); + __m256i concat3 = _mm256_set_m128i(y7, y5); + __m256i concat4 = _mm256_set_m128i(y10, y8); + __m256i concat5 = _mm256_set_m128i(y11, y9); + __m256i concat6 = _mm256_set_m128i(y14, y12); + __m256i concat7 = _mm256_set_m128i(y15, y13); + + // Then do a 16-bit interleave across y0. This is done in parallel on each 128b lane. 
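+        // A 16b unit here is two horizontally adjacent 8b texels (X0), so interleaving the units of
+        // rows y and y+1 places Y0 at bit 1 of the byte offset, as this MicroEq requires.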
+ // Each reg becomes: [ Y1 X2 X1 Y0 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi16(concat0, concat1); + __m256i unpack1 = _mm256_unpackhi_epi16(concat0, concat1); + __m256i unpack2 = _mm256_unpacklo_epi16(concat2, concat3); + __m256i unpack3 = _mm256_unpackhi_epi16(concat2, concat3); + __m256i unpack4 = _mm256_unpacklo_epi16(concat4, concat5); + __m256i unpack5 = _mm256_unpackhi_epi16(concat4, concat5); + __m256i unpack6 = _mm256_unpacklo_epi16(concat6, concat7); + __m256i unpack7 = _mm256_unpackhi_epi16(concat6, concat7); + + // Then do a cross-lane permute to change our 128b interleave across y1 to a 64-bit interleave. + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X2 Y1 X1 Y0 X0 ] + __m256i permute0 = _mm256_permute4x64_epi64(unpack0, 0b11011000); + __m256i permute1 = _mm256_permute4x64_epi64(unpack1, 0b11011000); + __m256i permute2 = _mm256_permute4x64_epi64(unpack2, 0b11011000); + __m256i permute3 = _mm256_permute4x64_epi64(unpack3, 0b11011000); + __m256i permute4 = _mm256_permute4x64_epi64(unpack4, 0b11011000); + __m256i permute5 = _mm256_permute4x64_epi64(unpack5, 0b11011000); + __m256i permute6 = _mm256_permute4x64_epi64(unpack6, 0b11011000); + __m256i permute7 = _mm256_permute4x64_epi64(unpack7, 0b11011000); + + // Finally, the top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, permute0); + _mm256_stream_si256(pAlignedOut++, permute2); + _mm256_stream_si256(pAlignedOut++, permute1); + _mm256_stream_si256(pAlignedOut++, permute3); + _mm256_stream_si256(pAlignedOut++, permute4); + _mm256_stream_si256(pAlignedOut++, permute6); + _mm256_stream_si256(pAlignedOut++, permute5); + _mm256_stream_si256(pAlignedOut++, permute7); + } +}; + + +class MicroSw_D_1BPE_AVX2 +{ + MicroSw_D_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, Y1, Y0, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Yes, that means double the load instructions + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = 
_mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y1, y0); + __m256i concat1 = _mm256_set_m128i(y3, y2); + __m256i concat2 = _mm256_set_m128i(y5, y4); + __m256i concat3 = _mm256_set_m128i(y7, y6); + __m256i concat4 = _mm256_set_m128i(y9, y8); + __m256i concat5 = _mm256_set_m128i(y11, y10); + __m256i concat6 = _mm256_set_m128i(y13, y12); + __m256i concat7 = _mm256_set_m128i(y15, y14); + + // Then do a 64-bit interleave along y1. + // Each reg becomes: [ Y0 Y1 X2 X1 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi64(concat0, concat1); + __m256i unpack1 = _mm256_unpackhi_epi64(concat0, concat1); + __m256i unpack2 = _mm256_unpacklo_epi64(concat2, concat3); + __m256i unpack3 = _mm256_unpackhi_epi64(concat2, concat3); + __m256i unpack4 = _mm256_unpacklo_epi64(concat4, concat5); + __m256i unpack5 = _mm256_unpackhi_epi64(concat4, concat5); + __m256i unpack6 = _mm256_unpacklo_epi64(concat6, concat7); + __m256i unpack7 = _mm256_unpackhi_epi64(concat6, concat7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; +#endif // ADDR_HAS_AVX2 + +#if ADDR_HAS_NEON +class MicroSw_2D_1BPE_NEON +{ + MicroSw_2D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint32x4_t y0 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4_t y1 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4_t y2 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4_t y3 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4_t y4 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4_t y5 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4_t y6 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4_t y7 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint32x4_t y8 = 
vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint32x4_t y9 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint32x4_t y10 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint32x4_t y11 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint32x4_t y12 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint32x4_t y13 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint32x4_t y14 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint32x4_t y15 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X2 Y0 X1 X0 ] + uint32x4_t comb0a = vzip1q_u32(y0, y1); + uint32x4_t comb0b = vzip2q_u32(y0, y1); + uint32x4_t comb1a = vzip1q_u32(y2, y3); + uint32x4_t comb1b = vzip2q_u32(y2, y3); + uint32x4_t comb2a = vzip1q_u32(y4, y5); + uint32x4_t comb2b = vzip2q_u32(y4, y5); + uint32x4_t comb3a = vzip1q_u32(y6, y7); + uint32x4_t comb3b = vzip2q_u32(y6, y7); + uint32x4_t comb4a = vzip1q_u32(y8, y9); + uint32x4_t comb4b = vzip2q_u32(y8, y9); + uint32x4_t comb5a = vzip1q_u32(y10, y11); + uint32x4_t comb5b = vzip2q_u32(y10, y11); + uint32x4_t comb6a = vzip1q_u32(y12, y13); + uint32x4_t comb6b = vzip2q_u32(y12, y13); + uint32x4_t comb7a = vzip1q_u32(y14, y15); + uint32x4_t comb7b = vzip2q_u32(y14, y15); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint32x4x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint32x4x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint32x4x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + // And store them using the largest contiguous store we can. + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_2D_2BPE_NEON +{ + MicroSw_2D_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Y0, X1, Y1, X2, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
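+        // vld1q_u32_x2 pulls in one full 32B row as two 128b halves: val[0] holds texels 0..7 and
+        // val[1] holds texels 8..15 of the row.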
+ // Each reg becomes: [ X2 X1 X0 0 ] + uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X1 Y0 X0 0 ] + uint32x4_t comb0a = vzip1q_u32(y0.val[0], y1.val[0]); + uint32x4_t comb0b = vzip2q_u32(y0.val[0], y1.val[0]); + uint32x4_t comb1a = vzip1q_u32(y0.val[1], y1.val[1]); + uint32x4_t comb1b = vzip2q_u32(y0.val[1], y1.val[1]); + uint32x4_t comb2a = vzip1q_u32(y2.val[0], y3.val[0]); + uint32x4_t comb2b = vzip2q_u32(y2.val[0], y3.val[0]); + uint32x4_t comb3a = vzip1q_u32(y2.val[1], y3.val[1]); + uint32x4_t comb3b = vzip2q_u32(y2.val[1], y3.val[1]); + uint32x4_t comb4a = vzip1q_u32(y4.val[0], y5.val[0]); + uint32x4_t comb4b = vzip2q_u32(y4.val[0], y5.val[0]); + uint32x4_t comb5a = vzip1q_u32(y4.val[1], y5.val[1]); + uint32x4_t comb5b = vzip2q_u32(y4.val[1], y5.val[1]); + uint32x4_t comb6a = vzip1q_u32(y6.val[0], y7.val[0]); + uint32x4_t comb6b = vzip2q_u32(y6.val[0], y7.val[0]); + uint32x4_t comb7a = vzip1q_u32(y6.val[1], y7.val[1]); + uint32x4_t comb7b = vzip2q_u32(y6.val[1], y7.val[1]); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint32x4x4_t out0 = { { comb0a, comb2a, comb0b, comb2b } }; + uint32x4x4_t out1 = { { comb4a, comb6a, comb4b, comb6b} }; + uint32x4x4_t out2 = { { comb1a, comb3a, comb1b, comb3b } }; + uint32x4x4_t out3 = { { comb5a, comb7a, comb5b, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +class MicroSw_2D_4BPE_NEON +{ + MicroSw_2D_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = { 8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
+ // Each reg becomes: [ X1 X0 0 0 ] + uint64x2x2_t y0 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint64x2x2_t y1 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint64x2x2_t y2 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint64x2x2_t y3 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint64x2x2_t y4 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint64x2x2_t y5 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint64x2x2_t y6 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint64x2x2_t y7 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // Do a 64-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ Y0 X0 0 0 ] + uint64x2_t comb0a = vzip1q_u64(y0.val[0], y1.val[0]); + uint64x2_t comb0b = vzip2q_u64(y0.val[0], y1.val[0]); + uint64x2_t comb1a = vzip1q_u64(y0.val[1], y1.val[1]); + uint64x2_t comb1b = vzip2q_u64(y0.val[1], y1.val[1]); + uint64x2_t comb2a = vzip1q_u64(y2.val[0], y3.val[0]); + uint64x2_t comb2b = vzip2q_u64(y2.val[0], y3.val[0]); + uint64x2_t comb3a = vzip1q_u64(y2.val[1], y3.val[1]); + uint64x2_t comb3b = vzip2q_u64(y2.val[1], y3.val[1]); + uint64x2_t comb4a = vzip1q_u64(y4.val[0], y5.val[0]); + uint64x2_t comb4b = vzip2q_u64(y4.val[0], y5.val[0]); + uint64x2_t comb5a = vzip1q_u64(y4.val[1], y5.val[1]); + uint64x2_t comb5b = vzip2q_u64(y4.val[1], y5.val[1]); + uint64x2_t comb6a = vzip1q_u64(y6.val[0], y7.val[0]); + uint64x2_t comb6b = vzip2q_u64(y6.val[0], y7.val[0]); + uint64x2_t comb7a = vzip1q_u64(y6.val[1], y7.val[1]); + uint64x2_t comb7b = vzip2q_u64(y6.val[1], y7.val[1]); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } + +}; + +class MicroSw_2D_8BPE_NEON +{ + MicroSw_2D_8BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, X1, X2, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 1}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // ARM can do a 512b load/store, but the actual values are 128b. + uint32x4x4_t y0 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x4_t y1 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x4_t y2 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x4_t y3 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
The bottom 4 bits need no handling.
+        uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y0.val[1], y1.val[1] } };
+        uint32x4x4_t out1 = { { y0.val[2], y1.val[2], y0.val[3], y1.val[3] } };
+        uint32x4x4_t out2 = { { y2.val[0], y3.val[0], y2.val[1], y3.val[1] } };
+        uint32x4x4_t out3 = { { y2.val[2], y3.val[2], y2.val[3], y3.val[3] } };
+
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 0 * 64)), out0);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 1 * 64)), out1);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 2 * 64)), out2);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 3 * 64)), out3);
+    }
+};
+class MicroSw_2D_16BPE_NEON
+{
+    MicroSw_2D_16BPE_NEON() = delete;
+public:
+    static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Y0, X1, Y1 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 1};
+    static constexpr UINT_32 BpeLog2 = 4;
+    static constexpr UINT_32 ExpandX = 2;
+
+    NEON_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+
+        // Unaligned buffer loads for 4 rows * (4 col * 128b = 512b)
+        // ARM can do a 512b load/store, but the actual values are 128b.
+        uint32x4x4_t y0 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 0))));
+        uint32x4x4_t y1 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 1))));
+        uint32x4x4_t y2 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 2))));
+        uint32x4x4_t y3 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 3))));
+
+        // The top 3 bits of the swizzle are handled by plain reg moves here. The rest are already right.
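+        // At 16B per texel each 128b register is exactly one texel, so the swizzle reduces to register
+        // reordering: out0 packs the 2x2 quad at (x0..1, y0..1) and the four 64B stores then step
+        // through X1 and Y1, i.e. offset bits 4..7 = (X0, Y0, X1, Y1).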
+ uint32x4x4_t out0 = { { y0.val[0], y0.val[1], y1.val[0], y1.val[1] } }; + uint32x4x4_t out1 = { { y0.val[2], y0.val[3], y1.val[2], y1.val[3] } }; + uint32x4x4_t out2 = { { y2.val[0], y2.val[1], y3.val[0], y3.val[1] } }; + uint32x4x4_t out3 = { { y2.val[2], y2.val[3], y3.val[2], y3.val[3] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +class MicroSw_3D_1BPE_NEON +{ + MicroSw_3D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Z0, Y0, Y1, Z1, X2, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 8}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // Do a lot of 64b (half reg) loads and join them + // Each reg becomes: [ Y1 X2 X1 X0 ] + uint32x4_t z0y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))))); + uint32x4_t z0y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))))); + uint32x4_t z1y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))))); + uint32x4_t z1y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))))); + uint32x4_t z2y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))))); + uint32x4_t z2y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))))); + uint32x4_t z3y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))))); + uint32x4_t z3y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))))); + uint32x4_t z4y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 2))))); + uint32x4_t z4y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 3))))); + uint32x4_t z5y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 0)))), + 
vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 2))))); + uint32x4_t z5y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 3))))); + uint32x4_t z6y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 2))))); + uint32x4_t z6y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 3))))); + uint32x4_t z7y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 2))))); + uint32x4_t z7y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 3))))); + + // Now, do two 32-bit interleaves, first for y0, then for z0 + // Each reg becomes: [ X2 Y0 X1 X0 ] + uint32x4_t ycomb0a = vzip1q_u32(z0y02, z0y13); + uint32x4_t ycomb0b = vzip2q_u32(z0y02, z0y13); + uint32x4_t ycomb1a = vzip1q_u32(z1y02, z1y13); + uint32x4_t ycomb1b = vzip2q_u32(z1y02, z1y13); + uint32x4_t ycomb2a = vzip1q_u32(z2y02, z2y13); + uint32x4_t ycomb2b = vzip2q_u32(z2y02, z2y13); + uint32x4_t ycomb3a = vzip1q_u32(z3y02, z3y13); + uint32x4_t ycomb3b = vzip2q_u32(z3y02, z3y13); + uint32x4_t ycomb4a = vzip1q_u32(z4y02, z4y13); + uint32x4_t ycomb4b = vzip2q_u32(z4y02, z4y13); + uint32x4_t ycomb5a = vzip1q_u32(z5y02, z5y13); + uint32x4_t ycomb5b = vzip2q_u32(z5y02, z5y13); + uint32x4_t ycomb6a = vzip1q_u32(z6y02, z6y13); + uint32x4_t ycomb6b = vzip2q_u32(z6y02, z6y13); + uint32x4_t ycomb7a = vzip1q_u32(z7y02, z7y13); + uint32x4_t ycomb7b = vzip2q_u32(z7y02, z7y13); + + // Each reg becomes: [ Y0 Z0 X1 X0 ] + uint32x4_t comb0a = vzip1q_u32(ycomb0a, ycomb1a); + uint32x4_t comb0b = vzip2q_u32(ycomb0a, ycomb1a); + uint32x4_t comb1a = vzip1q_u32(ycomb0b, ycomb1b); + uint32x4_t comb1b = vzip2q_u32(ycomb0b, ycomb1b); + uint32x4_t comb2a = vzip1q_u32(ycomb2a, ycomb3a); + uint32x4_t comb2b = vzip2q_u32(ycomb2a, ycomb3a); + uint32x4_t comb3a = vzip1q_u32(ycomb2b, ycomb3b); + uint32x4_t comb3b = vzip2q_u32(ycomb2b, ycomb3b); + uint32x4_t comb4a = vzip1q_u32(ycomb4a, ycomb5a); + uint32x4_t comb4b = vzip2q_u32(ycomb4a, ycomb5a); + uint32x4_t comb5a = vzip1q_u32(ycomb4b, ycomb5b); + uint32x4_t comb5b = vzip2q_u32(ycomb4b, ycomb5b); + uint32x4_t comb6a = vzip1q_u32(ycomb6a, ycomb7a); + uint32x4_t comb6b = vzip2q_u32(ycomb6a, ycomb7a); + uint32x4_t comb7a = vzip1q_u32(ycomb6b, ycomb7b); + uint32x4_t comb7b = vzip2q_u32(ycomb6b, ycomb7b); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
+ uint32x4x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint32x4x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint32x4x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_2BPE_NEON +{ + MicroSw_3D_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Z0, Y0, X1, Z1, Y1, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 8}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // Do a lot of 64b (half reg) loads and join them + // Each reg becomes: [ Y1 X1 X0 0 ] + uint32x4_t z0y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))))); + uint32x4_t z0y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))))); + uint32x4_t z1y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))))); + uint32x4_t z1y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))))); + uint32x4_t z2y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))))); + uint32x4_t z2y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))))); + uint32x4_t z3y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))))); + uint32x4_t z3y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))))); + uint32x4_t z4y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 2))))); + uint32x4_t z4y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 3))))); + uint32x4_t z5y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + 
(bufStrideY * 2))))); + uint32x4_t z5y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 3))))); + uint32x4_t z6y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 2))))); + uint32x4_t z6y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 3))))); + uint32x4_t z7y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 2))))); + uint32x4_t z7y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 3))))); + + // Now, do two 32-bit interleaves, first for y0, then for z0 + // Each reg becomes: [ X1 Y0 X0 0 ] + uint32x4_t ycomb0a = vzip1q_u32(z0y02, z0y13); + uint32x4_t ycomb0b = vzip2q_u32(z0y02, z0y13); + uint32x4_t ycomb1a = vzip1q_u32(z1y02, z1y13); + uint32x4_t ycomb1b = vzip2q_u32(z1y02, z1y13); + uint32x4_t ycomb2a = vzip1q_u32(z2y02, z2y13); + uint32x4_t ycomb2b = vzip2q_u32(z2y02, z2y13); + uint32x4_t ycomb3a = vzip1q_u32(z3y02, z3y13); + uint32x4_t ycomb3b = vzip2q_u32(z3y02, z3y13); + uint32x4_t ycomb4a = vzip1q_u32(z4y02, z4y13); + uint32x4_t ycomb4b = vzip2q_u32(z4y02, z4y13); + uint32x4_t ycomb5a = vzip1q_u32(z5y02, z5y13); + uint32x4_t ycomb5b = vzip2q_u32(z5y02, z5y13); + uint32x4_t ycomb6a = vzip1q_u32(z6y02, z6y13); + uint32x4_t ycomb6b = vzip2q_u32(z6y02, z6y13); + uint32x4_t ycomb7a = vzip1q_u32(z7y02, z7y13); + uint32x4_t ycomb7b = vzip2q_u32(z7y02, z7y13); + + // Each reg becomes: [ Y0 Z0 X0 0 ] + uint32x4_t comb0a = vzip1q_u32(ycomb0a, ycomb1a); + uint32x4_t comb0b = vzip2q_u32(ycomb0a, ycomb1a); + uint32x4_t comb1a = vzip1q_u32(ycomb0b, ycomb1b); + uint32x4_t comb1b = vzip2q_u32(ycomb0b, ycomb1b); + uint32x4_t comb2a = vzip1q_u32(ycomb2a, ycomb3a); + uint32x4_t comb2b = vzip2q_u32(ycomb2a, ycomb3a); + uint32x4_t comb3a = vzip1q_u32(ycomb2b, ycomb3b); + uint32x4_t comb3b = vzip2q_u32(ycomb2b, ycomb3b); + uint32x4_t comb4a = vzip1q_u32(ycomb4a, ycomb5a); + uint32x4_t comb4b = vzip2q_u32(ycomb4a, ycomb5a); + uint32x4_t comb5a = vzip1q_u32(ycomb4b, ycomb5b); + uint32x4_t comb5b = vzip2q_u32(ycomb4b, ycomb5b); + uint32x4_t comb6a = vzip1q_u32(ycomb6a, ycomb7a); + uint32x4_t comb6b = vzip2q_u32(ycomb6a, ycomb7a); + uint32x4_t comb7a = vzip1q_u32(ycomb6b, ycomb7b); + uint32x4_t comb7b = vzip2q_u32(ycomb6b, ycomb7b); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. 
+ uint32x4x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint32x4x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint32x4x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_4BPE_NEON +{ + MicroSw_3D_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Z0, Y1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 4}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // Each reg becomes: [ X1 X0 0 0 ] + uint64x2_t z0y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))); + uint64x2_t z0y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))); + uint64x2_t z0y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2)))); + uint64x2_t z0y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3)))); + uint64x2_t z1y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))); + uint64x2_t z1y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))); + uint64x2_t z1y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2)))); + uint64x2_t z1y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3)))); + uint64x2_t z2y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))); + uint64x2_t z2y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))); + uint64x2_t z2y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2)))); + uint64x2_t z2y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3)))); + uint64x2_t z3y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))); + uint64x2_t z3y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))); + uint64x2_t z3y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2)))); + uint64x2_t z3y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3)))); + + // Then do a 64-bit interleave across y0 + // Each reg becomes: [ Y0 X0 0 0 ] + uint64x2_t comb0a = vzip1q_u64(z0y0, z0y1); + uint64x2_t comb0b = vzip2q_u64(z0y0, z0y1); + uint64x2_t comb1a = vzip1q_u64(z0y2, z0y3); + uint64x2_t comb1b = vzip2q_u64(z0y2, z0y3); + uint64x2_t comb2a = vzip1q_u64(z1y0, z1y1); + uint64x2_t comb2b = vzip2q_u64(z1y0, z1y1); + uint64x2_t comb3a = vzip1q_u64(z1y2, z1y3); + uint64x2_t comb3b = vzip2q_u64(z1y2, z1y3); + uint64x2_t comb4a = vzip1q_u64(z2y0, z2y1); + uint64x2_t comb4b = vzip2q_u64(z2y0, z2y1); + uint64x2_t comb5a = vzip1q_u64(z2y2, z2y3); + uint64x2_t comb5b = vzip2q_u64(z2y2, z2y3); + uint64x2_t 
comb6a = vzip1q_u64(z3y0, z3y1); + uint64x2_t comb6b = vzip2q_u64(z3y0, z3y1); + uint64x2_t comb7a = vzip1q_u64(z3y2, z3y3); + uint64x2_t comb7b = vzip2q_u64(z3y2, z3y3); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. + uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_8BPE_NEON +{ + MicroSw_3D_8BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, Z0, X1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 2, 4}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. + // Each reg becomes: [ X0 0 0 0 ] + uint32x4x2_t z0y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + uint32x4x2_t z0y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + uint32x4x2_t z1y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + uint32x4x2_t z1y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + uint32x4x2_t z2y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + uint32x4x2_t z2y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + uint32x4x2_t z3y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + uint32x4x2_t z3y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. 
+        uint32x4x4_t out0 = { { z0y0.val[0], z0y1.val[0], z1y0.val[0], z1y1.val[0] } };
+        uint32x4x4_t out1 = { { z0y0.val[1], z0y1.val[1], z1y0.val[1], z1y1.val[1] } };
+        uint32x4x4_t out2 = { { z2y0.val[0], z2y1.val[0], z3y0.val[0], z3y1.val[0] } };
+        uint32x4x4_t out3 = { { z2y0.val[1], z2y1.val[1], z3y0.val[1], z3y1.val[1] } };
+
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 0 * 64)), out0);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 1 * 64)), out1);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 2 * 64)), out2);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 3 * 64)), out3);
+    }
+};
+
+class MicroSw_3D_16BPE_NEON
+{
+    MicroSw_3D_16BPE_NEON() = delete;
+public:
+    static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Z0, Y0, Z1 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {2, 2, 4};
+    static constexpr UINT_32 BpeLog2 = 4;
+    static constexpr UINT_32 ExpandX = 2;
+
+    NEON_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+        // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b)
+        // ARM can do a 256b load/store, but the actual values are 128b.
+        uint32x4x2_t z0y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 0))));
+        uint32x4x2_t z0y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY)));
+        uint32x4x2_t z1y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 1))));
+        uint32x4x2_t z1y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY)));
+        uint32x4x2_t z2y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 2))));
+        uint32x4x2_t z2y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY)));
+        uint32x4x2_t z3y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 3))));
+        uint32x4x2_t z3y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY)));
+
+        // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling.
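+        // One texel per 128b register again: out0 packs (x0..1, y0, z0..1) and successive 64B stores
+        // advance Y0 and Z1, i.e. offset bits 4..7 = (X0, Z0, Y0, Z1) as in the MicroEq above.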
+ uint32x4x4_t out0 = { { z0y0.val[0], z0y0.val[1], z1y0.val[0], z1y0.val[1] } }; + uint32x4x4_t out1 = { { z0y1.val[0], z0y1.val[1], z1y1.val[0], z1y1.val[1] } }; + uint32x4x4_t out2 = { { z2y0.val[0], z2y0.val[1], z3y0.val[0], z3y0.val[1] } }; + uint32x4x4_t out3 = { { z2y1.val[0], z2y1.val[1], z3y1.val[0], z3y1.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_1BPE_NEON +{ + MicroSw_R_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, X3, Y0, Y1, Y2, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint32x4_t y0 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4_t y1 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4_t y2 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4_t y3 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4_t y4 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4_t y5 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4_t y6 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4_t y7 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint32x4_t y8 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint32x4_t y9 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint32x4_t y10 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint32x4_t y11 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint32x4_t y12 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint32x4_t y13 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint32x4_t y14 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint32x4_t y15 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. 
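+        // With a 16x16 block at 1B per texel each 128b register is a whole row and bits 4..7 are just
+        // Y0..Y3, so the microblock is written out as sixteen consecutive rows (linear within the block).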
+ uint32x4x4_t out0 = { { y0, y1, y2, y3} }; + uint32x4x4_t out1 = { { y4, y5, y6, y7 } }; + uint32x4x4_t out2 = { { y8, y9, y10, y11} }; + uint32x4x4_t out3 = { { y12, y13, y14, y15} }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_2BPE_NEON +{ + MicroSw_R_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, X1, X2, Y0, Y1, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. + // Each reg becomes: [ X2 X1 X0 0 ] + uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. + uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y2.val[0], y3.val[0]} }; + uint32x4x4_t out1 = { { y4.val[0], y5.val[0], y6.val[0], y7.val[0] } }; + uint32x4x4_t out2 = { { y0.val[1], y1.val[1], y2.val[1], y3.val[1]} }; + uint32x4x4_t out3 = { { y4.val[1], y5.val[1], y6.val[1], y7.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_4BPE_NEON +{ + MicroSw_R_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, X1, Y0, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 4; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
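+        // Here a row is 8 texels * 4B = 32B; val[0] and val[1] of each load hold texels 0..3 and 4..7.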
+ uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. + uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y2.val[0], y3.val[0]} }; + uint32x4x4_t out1 = { { y0.val[1], y1.val[1], y2.val[1], y3.val[1]} }; + uint32x4x4_t out2 = { { y4.val[0], y5.val[0], y6.val[0], y7.val[0] } }; + uint32x4x4_t out3 = { { y4.val[1], y5.val[1], y6.val[1], y7.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_Z_1BPE_NEON +{ + MicroSw_Z_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, Y0, X1, Y1, X2, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint16x8_t y0 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint16x8_t y1 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint16x8_t y2 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint16x8_t y3 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint16x8_t y4 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint16x8_t y5 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint16x8_t y6 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint16x8_t y7 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint16x8_t y8 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint16x8_t y9 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint16x8_t y10 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint16x8_t y11 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint16x8_t y12 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint16x8_t y13 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint16x8_t y14 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint16x8_t y15 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // First do a 16-bit interleave across y0 and recast to 64-bits (recast is a nop, just C++-ness) + // Each reg 
becomes: [ X2 X1 Y0 X0 ] + uint64x2_t firstcomb0a = vreinterpretq_u64_u16(vzip1q_u16(y0, y1)); + uint64x2_t firstcomb0b = vreinterpretq_u64_u16(vzip2q_u16(y0, y1)); + uint64x2_t firstcomb1a = vreinterpretq_u64_u16(vzip1q_u16(y2, y3)); + uint64x2_t firstcomb1b = vreinterpretq_u64_u16(vzip2q_u16(y2, y3)); + uint64x2_t firstcomb2a = vreinterpretq_u64_u16(vzip1q_u16(y4, y5)); + uint64x2_t firstcomb2b = vreinterpretq_u64_u16(vzip2q_u16(y4, y5)); + uint64x2_t firstcomb3a = vreinterpretq_u64_u16(vzip1q_u16(y6, y7)); + uint64x2_t firstcomb3b = vreinterpretq_u64_u16(vzip2q_u16(y6, y7)); + uint64x2_t firstcomb4a = vreinterpretq_u64_u16(vzip1q_u16(y8, y9)); + uint64x2_t firstcomb4b = vreinterpretq_u64_u16(vzip2q_u16(y8, y9)); + uint64x2_t firstcomb5a = vreinterpretq_u64_u16(vzip1q_u16(y10, y11)); + uint64x2_t firstcomb5b = vreinterpretq_u64_u16(vzip2q_u16(y10, y11)); + uint64x2_t firstcomb6a = vreinterpretq_u64_u16(vzip1q_u16(y12, y13)); + uint64x2_t firstcomb6b = vreinterpretq_u64_u16(vzip2q_u16(y12, y13)); + uint64x2_t firstcomb7a = vreinterpretq_u64_u16(vzip1q_u16(y14, y15)); + uint64x2_t firstcomb7b = vreinterpretq_u64_u16(vzip2q_u16(y14, y15)); + + // Then do a 64-bit interleave across y1 + // Each reg becomes: [ Y1 X1 Y0 X0 ] + uint64x2_t comb0a = vzip1q_u64(firstcomb0a, firstcomb1a); + uint64x2_t comb0b = vzip2q_u64(firstcomb0a, firstcomb1a); + uint64x2_t comb1a = vzip1q_u64(firstcomb0b, firstcomb1b); + uint64x2_t comb1b = vzip2q_u64(firstcomb0b, firstcomb1b); + uint64x2_t comb2a = vzip1q_u64(firstcomb2a, firstcomb3a); + uint64x2_t comb2b = vzip2q_u64(firstcomb2a, firstcomb3a); + uint64x2_t comb3a = vzip1q_u64(firstcomb2b, firstcomb3b); + uint64x2_t comb3b = vzip2q_u64(firstcomb2b, firstcomb3b); + uint64x2_t comb4a = vzip1q_u64(firstcomb4a, firstcomb5a); + uint64x2_t comb4b = vzip2q_u64(firstcomb4a, firstcomb5a); + uint64x2_t comb5a = vzip1q_u64(firstcomb4b, firstcomb5b); + uint64x2_t comb5b = vzip2q_u64(firstcomb4b, firstcomb5b); + uint64x2_t comb6a = vzip1q_u64(firstcomb6a, firstcomb7a); + uint64x2_t comb6b = vzip2q_u64(firstcomb6a, firstcomb7a); + uint64x2_t comb7a = vzip1q_u64(firstcomb6b, firstcomb7b); + uint64x2_t comb7b = vzip2q_u64(firstcomb6b, firstcomb7b); + + // Finally, the top 4 bits of the swizzle are handled by plain reg moves here. 
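+        // Each comb register now holds one 4x4 texel tile (offset bits 0..3 = X0, Y0, X1, Y1); the
+        // grouping and store order below supply bits 4..7 = (X2, Y2, X3, Y3).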
+ uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_D_1BPE_NEON +{ + MicroSw_D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, Y1, Y0, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint64x2_t y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint64x2_t y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint64x2_t y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint64x2_t y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint64x2_t y4 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint64x2_t y5 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint64x2_t y6 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint64x2_t y7 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint64x2_t y8 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint64x2_t y9 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint64x2_t y10 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint64x2_t y11 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint64x2_t y12 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint64x2_t y13 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint64x2_t y14 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint64x2_t y15 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // Do a 64-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ Y1 X2 X1 X0 ] + uint64x2_t comb0a = vzip1q_u64(y0, y2); + uint64x2_t comb0b = vzip2q_u64(y0, y2); + uint64x2_t comb1a = vzip1q_u64(y1, y3); + uint64x2_t comb1b = vzip2q_u64(y1, y3); + uint64x2_t comb2a = vzip1q_u64(y4, y6); + uint64x2_t comb2b = vzip2q_u64(y4, y6); + uint64x2_t comb3a = vzip1q_u64(y5, y7); + uint64x2_t comb3b = vzip2q_u64(y5, y7); + uint64x2_t comb4a = vzip1q_u64(y8, y10); + uint64x2_t comb4b = vzip2q_u64(y8, y10); + uint64x2_t comb5a = vzip1q_u64(y9, y11); + uint64x2_t comb5b = vzip2q_u64(y9, y11); + uint64x2_t comb6a = vzip1q_u64(y12, y14); + uint64x2_t comb6b = vzip2q_u64(y12, y14); + uint64x2_t comb7a = vzip1q_u64(y13, y15); + uint64x2_t comb7b = vzip2q_u64(y13, y15); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
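+        // Each comb register holds texels 0..7 (or 8..15) of rows y and y+2, i.e. bits 0..3 = (X0..X2, Y1);
+        // the grouping below supplies bits 4..7 = (Y0, Y2, X3, Y3) per the MicroEq.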
+ uint64x2x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint64x2x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint64x2x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +#endif // ADDR_HAS_NEON + +} +#endif diff --git a/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp b/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp index 190b1b0a5ea..75e47bf2812 100644 --- a/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp +++ b/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -2782,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if ((forbid64KbBlockType == FALSE) && (forbidVarBlockType == FALSE)) { + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_8 maxFmaskSwizzleModeType = 2; - const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1); const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags); const UINT_32 numSlices = Max(pIn->numSlices, 1u); const UINT_32 width = Max(pIn->width, 1u); @@ -3097,8 +3099,10 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( // Tracks the size of each valid swizzle mode's surface in bytes UINT_64 padSize[AddrBlockMaxTiledType] = {}; - const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1); + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use UINT_64 minSize = 0; // Tracks the minimum acceptable block type @@ -4111,7 +4115,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx10Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. @@ -4177,7 +4181,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -4192,35 +4196,27 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -4230,7 +4226,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( * Gfx10Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -4296,7 +4292,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -4305,40 +4301,32 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; diff --git a/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp b/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp index 4fd4fc19474..83dfeee09dc 100644 --- a/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp +++ b/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -758,6 +758,14 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily( case FAMILY_PHX: m_settings.isPhoenix = 1; break; + case FAMILY_GFX1170: + { + if (ASICREV_IS_GFX1170(chipRevision)) + { + m_settings.isGfx1170 = 1; + } + } + break; default: ADDR_ASSERT(!"Unknown chip family"); break; @@ -2651,10 +2659,13 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( UINT_64 padSize[AddrBlockMaxTiledType] = {}; - const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; + UINT_32 selectedBlk = AddrBlockMaxTiledType; UINT_64 minSize = 0; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; @@ -2678,11 +2689,66 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( { padSize[i] = localOut.surfSize; - if ((minSize == 0) || - Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + if (pIn->useBlockBasedHeuristic) { - minSize = padSize[i]; - minSizeBlk = i; + const UINT_32 blockCountX = localOut.pitch / localOut.blockWidth; + const UINT_32 blockCountY = localOut.height / localOut.blockHeight; + const UINT_32 blockCountZ = localOut.numSlices / localOut.blockSlices; + + UINT_32 requiredBlockCountX = 1; + UINT_32 requiredBlockCountY = 1; + UINT_32 requiredBlockCountZ = 1; + + switch (pIn->resourceType) + { + case ADDR_RSRC_TEX_1D: + requiredBlockCountX = 2; + break; + case ADDR_RSRC_TEX_2D: + requiredBlockCountX = 2; + requiredBlockCountY = 2; + break; + case ADDR_RSRC_TEX_3D: + requiredBlockCountX = 2; + requiredBlockCountY = 2; + if (IsThick(pIn->resourceType, localIn.swizzleMode)) + { + requiredBlockCountZ = 2; + } + break; + default: + ADDR_ASSERT_ALWAYS(); + } + + // If the block count is sufficient, select this block type. Otherwise, track the block type with minimum size to + // fall back to it, in case no block type can satisfy the block count requirement. + if ((blockCountX >= requiredBlockCountX) && + (blockCountY >= requiredBlockCountY) && + (blockCountZ >= requiredBlockCountZ) && + (localIn.swizzleMode != ADDR_SW_LINEAR)) + { + selectedBlk = i; + } + else + { + const bool has3DThick = (allowedSwModeSet.value & Gfx11Rsrc3dThickSwModeMask) != 0; + const bool is3DThin = (pOut->resourceType == ADDR_RSRC_TEX_3D) && + IsThin(pOut->resourceType, swMode[i]); + if (((has3DThick && is3DThin) == FALSE) && (minSize == 0 || (padSize[i] < minSize))) + { + minSize = padSize[i]; + minSizeBlk = i; + } + } + } + else + { + if ((minSize == 0) || + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + { + minSize = padSize[i]; + minSizeBlk = i; + } } } else @@ -2693,63 +2759,77 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } } - if (pIn->memoryBudget > 1.0) + if (pIn->useBlockBasedHeuristic) { - // If minimum size is given by swizzle mode with bigger-block type, then don't ever check - // smaller-block type again in coming loop - switch (minSizeBlk) + // If there was no block size that would satisfy block based heuristic, fall back to the budget-based heuristic. 
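+        // [Editorial example - not part of the original change] This fallback triggers for
+        // surfaces too small to span two blocks on the required axes in any tiled mode
+        // (e.g. a mip only a few texels across); in that case the smallest-footprint
+        // candidate tracked above is used instead.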
+ if (selectedBlk == AddrBlockMaxTiledType) { - case AddrBlockThick256KB: - allowedBlockSet.gfx11.thin256KB = 0; - case AddrBlockThin256KB: - allowedBlockSet.macroThick64KB = 0; - case AddrBlockThick64KB: - allowedBlockSet.macroThin64KB = 0; - case AddrBlockThin64KB: - allowedBlockSet.macroThick4KB = 0; - case AddrBlockThick4KB: - allowedBlockSet.macroThin4KB = 0; - case AddrBlockThin4KB: - allowedBlockSet.micro = 0; - case AddrBlockMicro: - allowedBlockSet.linear = 0; - case AddrBlockLinear: - break; - - default: - ADDR_ASSERT_ALWAYS(); - break; + selectedBlk = minSizeBlk; } - - for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) + } + else + { + if (pIn->memoryBudget > 1.0) { - if ((i != minSizeBlk) && - Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<::AddrBlockType>(i))) + // If minimum size is given by swizzle mode with bigger-block type, then don't ever check + // smaller-block type again in coming loop + switch (minSizeBlk) { - if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + case AddrBlockThick256KB: + allowedBlockSet.gfx11.thin256KB = 0; + case AddrBlockThin256KB: + allowedBlockSet.macroThick64KB = 0; + case AddrBlockThick64KB: + allowedBlockSet.macroThin64KB = 0; + case AddrBlockThin64KB: + allowedBlockSet.macroThick4KB = 0; + case AddrBlockThick4KB: + allowedBlockSet.macroThin4KB = 0; + case AddrBlockThin4KB: + allowedBlockSet.micro = 0; + case AddrBlockMicro: + allowedBlockSet.linear = 0; + case AddrBlockLinear: + break; + + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) + { + if ((i != minSizeBlk) && + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - // Clear the block type if the memory waste is unacceptable - allowedBlockSet.value &= ~(1u << (i - 1)); + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + { + // Clear the block type if the memory waste is unacceptable + allowedBlockSet.value &= ~(1u << (i - 1)); + } } } + + // Remove linear block type if 2 or more block types are allowed + if (IsPow2(allowedBlockSet.value) == FALSE) + { + allowedBlockSet.linear = 0; + } + + // Select the biggest allowed block type + minSizeBlk = Log2(allowedBlockSet.value) + 1; + + if (minSizeBlk == static_cast(AddrBlockMaxTiledType)) + { + minSizeBlk = AddrBlockLinear; + } } - // Remove linear block type if 2 or more block types are allowed - if (IsPow2(allowedBlockSet.value) == FALSE) - { - allowedBlockSet.linear = 0; - } - - // Select the biggest allowed block type - minSizeBlk = Log2(allowedBlockSet.value) + 1; - - if (minSizeBlk == static_cast(AddrBlockMaxTiledType)) - { - minSizeBlk = AddrBlockLinear; - } + selectedBlk = minSizeBlk; } - switch (minSizeBlk) + + switch (selectedBlk) { case AddrBlockLinear: allowedSwModeSet.value &= Gfx11LinearSwModeMask; @@ -3685,7 +3765,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx11Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. 
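[Editorial note, not part of the patch] The copy hunks below, like the gfx10 hunks above and the gfx12 hunks further down, replace the old per-slice loop with one callback invocation per region: the callback now receives the full 3D origin and extent, both memory pitches, the block counts per padded row (yBlks) and per padded slice (zBlks), the raw pipe-bank XOR, and whether the subresource sits in the mip tail. The real UnalignedCopyMemImgFunc typedef lives in addrswizzler.h and is outside this excerpt; inferred purely from the call sites in this patch, it looks roughly like the sketch below (parameter names and exact types are illustrative only):

    // Sketch of the copy-callback shape implied by the call sites in this patch.
    typedef void (*UnalignedCopyMemImgFunc)(
        void*               pImg,          // mapped surface, already offset to the mip's macro block
        void*               pMem,          // linear CPU memory for the region
        UINT_32             memRowPitch,   // bytes per row of pMem
        UINT_32             memSlicePitch, // bytes per slice of pMem
        UINT_32             yBlks,         // swizzle blocks per padded row
        UINT_32             zBlks,         // swizzle blocks per padded slice
        ADDR_COORD3D        origin,        // region origin, including any mip-tail offset
        ADDR_EXTENT3D       copyDims,      // region extent in texels
        UINT_32             pbXor,         // pipe-bank XOR of the surface
        BOOL_32             inMipTail,     // whether this mip level lives in the mip tail
        const LutAddresser& addresser);    // swizzle-pattern lookup helper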
@@ -3751,7 +3831,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -3766,35 +3846,27 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -3804,7 +3876,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( * Gfx11Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -3870,7 +3942,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -3879,40 +3951,32 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; diff --git a/src/amd/addrlib/src/gfx11/gfx11addrlib.h b/src/amd/addrlib/src/gfx11/gfx11addrlib.h index fd6508903c1..e0596c656c1 100644 --- a/src/amd/addrlib/src/gfx11/gfx11addrlib.h +++ b/src/amd/addrlib/src/gfx11/gfx11addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,7 +36,8 @@ struct Gfx11ChipSettings { UINT_32 isStrix : 1; UINT_32 isPhoenix : 1; - UINT_32 reserved1 : 30; + UINT_32 isGfx1170 : 1; + UINT_32 reserved1 : 29; // Misc configuration bits UINT_32 reserved2 : 32; diff --git a/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp b/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp index 755b3c6adbb..0fcc8545194 100644 --- a/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp +++ b/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. 
* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -71,8 +71,7 @@ const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] = Gfx12Lib::Gfx12Lib( const Client* pClient) : - Lib(pClient), - m_numSwizzleBits(0) + Lib(pClient) { memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable)); } @@ -878,7 +877,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx12Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. @@ -925,7 +924,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( } LutAddresser addresser = LutAddresser(); - UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; + UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; if (returnCode == ADDR_OK) { const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); @@ -936,7 +935,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {}; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2); - pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); // What format is this? @@ -952,35 +951,27 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. 
- UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -990,7 +981,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( * Gfx12Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -1037,7 +1028,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( } LutAddresser addresser = LutAddresser(); - UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; + UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; if (returnCode == ADDR_OK) { const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); @@ -1048,7 +1039,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {}; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2); - pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); // What format is this? @@ -1058,78 +1049,37 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR3_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. 
- UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; } - -/** -************************************************************************************************************************ -* Gfx12Lib::HwlComputePipeBankXor -* -* @brief -* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address -* -* @return -* PipeBankXor value -************************************************************************************************************************ -*/ -ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor( - const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure - ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure - ) const -{ - if ((m_numSwizzleBits != 0) && // does this configuration support swizzling - // base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes, - // Note that Linear and 256B are excluded. - (IsLinear(pIn->swizzleMode) == FALSE) && - (IsBlock256b(pIn->swizzleMode) == FALSE)) - { - pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits); - } - else - { - pOut->pipeBankXor = 0; - } - - return ADDR_OK; -} - /** ************************************************************************************************************************ * Gfx12Lib::GetSwizzlePatternInfo @@ -1263,72 +1213,13 @@ const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo( BOOL_32 Gfx12Lib::HwlInitGlobalParams( const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input { - BOOL_32 valid = TRUE; - GB_ADDR_CONFIG_GFX12 gbAddrConfig; - - gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig; - - switch (gbAddrConfig.bits.NUM_PIPES) - { - case ADDR_CONFIG_1_PIPE: - m_pipesLog2 = 0; - break; - case ADDR_CONFIG_2_PIPE: - m_pipesLog2 = 1; - break; - case ADDR_CONFIG_4_PIPE: - m_pipesLog2 = 2; - break; - case ADDR_CONFIG_8_PIPE: - m_pipesLog2 = 3; - break; - case ADDR_CONFIG_16_PIPE: - m_pipesLog2 = 4; - break; - case ADDR_CONFIG_32_PIPE: - m_pipesLog2 = 5; - break; - case ADDR_CONFIG_64_PIPE: - m_pipesLog2 = 6; - break; - default: - ADDR_ASSERT_ALWAYS(); - valid = FALSE; - break; - } - - switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE) - { - case ADDR_CONFIG_PIPE_INTERLEAVE_256B: - m_pipeInterleaveLog2 = 8; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_512B: - m_pipeInterleaveLog2 = 9; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_1KB: - m_pipeInterleaveLog2 = 10; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_2KB: - m_pipeInterleaveLog2 = 11; - break; - default: - ADDR_ASSERT_ALWAYS(); - valid = FALSE; - break; - } - - m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0); - // Gfx10+ chips treat packed 8-bit 422 formats as 32bpe with 2pix/elem. 
m_configFlags.use32bppFor422Fmt = TRUE; - if (valid) - { - InitEquationTable(); - InitBlockDimensionTable(); - } + InitEquationTable(); + InitBlockDimensionTable(); - return valid; + return TRUE; } /** @@ -1579,10 +1470,10 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor( pIn->slice, 0); - const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2; + const UINT_32 pipeBankXor = pipeBankXorOffset >> PipeInterleaveLog2; // Should have no bit set under pipe interleave - ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset); + ADDR_ASSERT((pipeBankXor << PipeInterleaveLog2) == pipeBankXorOffset); pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor; } @@ -2043,7 +1934,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( UINT_32 yPosMask = 0; // First get "max y bit" - for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++) + for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++) { ADDR_ASSERT(m_equationTable[eqIndex].addr[i].valid == 1); @@ -2055,7 +1946,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( } // Then loop again for populating a position mask of "max Y bit" - for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++) + for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++) { if ((m_equationTable[eqIndex].addr[i].channel == 1) && (m_equationTable[eqIndex].addr[i].index == yMax)) @@ -2074,7 +1965,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( if ((alignedHeight >> yMax) & 1) { - *pRightXor = yPosMask >> m_pipeInterleaveLog2; + *pRightXor = yPosMask >> PipeInterleaveLog2; } } } diff --git a/src/amd/addrlib/src/gfx12/gfx12addrlib.h b/src/amd/addrlib/src/gfx12/gfx12addrlib.h index fec4db0ce49..22cdb4959c5 100644 --- a/src/amd/addrlib/src/gfx12/gfx12addrlib.h +++ b/src/amd/addrlib/src/gfx12/gfx12addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -147,10 +147,6 @@ private: static const UINT_32 MaxImageDim = 32768; // Max image size is 32k static const UINT_32 MaxMipLevels = 16; - virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( - const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, - ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override; - virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override; virtual ADDR_E_RETURNCODE HwlComputeStereoInfo( @@ -172,8 +168,6 @@ private: const ADDR3_COPY_MEMSURFACE_REGION* pRegions, UINT_32 regionCount) const override; - UINT_32 m_numSwizzleBits; - // Initialize equation table VOID InitEquationTable();
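
[Editorial note, not part of the patch] Several hunks above replace the open-coded ratioLow/ratioHi ternaries in the gfx10 and gfx11 preferred-setting paths with a shared GetSwizzleModePreferenceRatio() call. The helper's definition is outside this excerpt; a plausible consolidation of the removed inline expressions, shown only as a sketch (the flag set it actually consults may be wider), is:

    // Sketch: reproduces the inline logic this patch removes. The removed code used a 1:1
    // budget when minimum alignment/size was requested, 3:2 under opt4space, and 2:1 otherwise.
    static void GetSwizzleModePreferenceRatioSketch(
        const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,       // assumed input type
        UINT_32*                                      pRatioLow,
        UINT_32*                                      pRatioHi)
    {
        if (pIn->flags.minimizeAlign)
        {
            *pRatioLow = 1;
            *pRatioHi  = 1;
        }
        else if (pIn->flags.opt4space)
        {
            *pRatioLow = 3;
            *pRatioHi  = 2;
        }
        else
        {
            *pRatioLow = 2;
            *pRatioHi  = 1;
        }
    }

The resulting ratios feed Addr2BlockTypeWithinMemoryBudget() at the same call sites as before.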