From d778ede72c4c31bf4ed80578688c3e78d9af2dfa Mon Sep 17 00:00:00 2001 From: Ganesh Belgur Ramachandra Date: Mon, 23 Mar 2026 22:14:28 +0000 Subject: [PATCH] amd: import gfx11.7 addrlib Part-of: --- src/amd/addrlib/inc/addrinterface.h | 132 +- src/amd/addrlib/inc/addrtypes.h | 2 +- src/amd/addrlib/meson.build | 1 + src/amd/addrlib/src/addrinterface.cpp | 30 +- src/amd/addrlib/src/amdgpu_asic_addr.h | 5 +- src/amd/addrlib/src/core/addrcommon.h | 39 +- src/amd/addrlib/src/core/addrlib.cpp | 84 +- src/amd/addrlib/src/core/addrlib.h | 10 +- src/amd/addrlib/src/core/addrlib1.cpp | 10 +- src/amd/addrlib/src/core/addrlib2.cpp | 20 +- src/amd/addrlib/src/core/addrlib3.cpp | 43 +- src/amd/addrlib/src/core/addrlib3.h | 13 +- src/amd/addrlib/src/core/addrswizzler.cpp | 847 +++++++- src/amd/addrlib/src/core/addrswizzler.h | 33 +- src/amd/addrlib/src/core/addrswizzlersimd.h | 2031 +++++++++++++++++++ src/amd/addrlib/src/gfx10/gfx10addrlib.cpp | 110 +- src/amd/addrlib/src/gfx11/gfx11addrlib.cpp | 280 ++- src/amd/addrlib/src/gfx11/gfx11addrlib.h | 5 +- src/amd/addrlib/src/gfx12/gfx12addrlib.cpp | 213 +- src/amd/addrlib/src/gfx12/gfx12addrlib.h | 8 +- 20 files changed, 3407 insertions(+), 509 deletions(-) create mode 100644 src/amd/addrlib/src/core/addrswizzlersimd.h diff --git a/src/amd/addrlib/inc/addrinterface.h b/src/amd/addrlib/inc/addrinterface.h index 4ec2026376d..c1d9c1f7e5b 100644 --- a/src/amd/addrlib/inc/addrinterface.h +++ b/src/amd/addrlib/inc/addrinterface.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -24,7 +24,7 @@ extern "C" #endif #define ADDRLIB_VERSION_MAJOR 10 -#define ADDRLIB_VERSION_MINOR 1 +#define ADDRLIB_VERSION_MINOR 6 #define ADDRLIB_MAKE_VERSION(major, minor) ((major << 16) | minor) #define ADDRLIB_VERSION ADDRLIB_MAKE_VERSION(ADDRLIB_VERSION_MAJOR, ADDRLIB_VERSION_MINOR) @@ -107,6 +107,11 @@ typedef struct _ADDR_EXTENT3D * AddrComputeFmaskAddrFromCoord() * AddrComputeFmaskCoordFromAddr() * +* ///////////////////////////////////////////////////////////////////////////////////////////////// +* // Format properties functions +* ///////////////////////////////////////////////////////////////////////////////////////////////// +* AddrFormatProperties() +* **/ /** * ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -452,6 +457,49 @@ ADDR_E_RETURNCODE ADDR_API AddrCreate( ADDR_E_RETURNCODE ADDR_API AddrDestroy( ADDR_HANDLE hLib); +/** +**************************************************************************************************** +* ADDR_FORMAT_PROPERTIES_IN +* +* @brief +* Input structure to the AddrFormatProperties routine. 
+* +**************************************************************************************************** +*/ +typedef struct _ADDR_FORMAT_PROPERTIES_IN { + UINT_32 size; ///< Size of this structure in bytes + AddrFormat format; ///< If format is set to valid one, bpp/width/height + /// might be overwritten +} ADDR_FORMAT_PROPERTIES_IN; + +/** +**************************************************************************************************** +* ADDR_FORMAT_PROPERTIES_OUT +* +* @brief +* Output structure from the AddrFormatProperties routine. +* +**************************************************************************************************** +*/ +typedef struct _ADDR_FORMAT_PROPERTIES_OUT { + UINT_32 size; ///< Size of this structure in bytes + UINT_32 bpp; ///< Bits per pixel as laid out in memory (eg. 128bpp for BC7) + ADDR_EXTENT2D expand; ///< Dimensions of one macro pixel block +} ADDR_FORMAT_PROPERTIES_OUT; + +/** +**************************************************************************************************** +* AddrFormatProperties +* +* @brief +* Gets a list of format properties +* +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API AddrFormatProperties( + ADDR_HANDLE hLib, + const ADDR_FORMAT_PROPERTIES_IN* in, + ADDR_FORMAT_PROPERTIES_OUT* pOut); //////////////////////////////////////////////////////////////////////////////////////////////////// // Surface functions @@ -2463,6 +2511,7 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 rotated : 1; ///< This resource is rotated and displayable UINT_32 needEquation : 1; ///< This resource needs equation to be generated if possible UINT_32 opt4space : 1; ///< This resource should be optimized for space + UINT_32 computeMaxSize : 1; ///< This resource should select the largest swizzle possible UINT_32 minimizeAlign : 1; ///< This resource should use minimum alignment UINT_32 noMetadata : 1; ///< This resource has no metadata UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata @@ -2470,7 +2519,7 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs) UINT_32 requireMetadata : 1; ///< This resource must support metadata - UINT_32 reserved : 11; ///< Reserved bits + UINT_32 reserved : 10; ///< Reserved bits }; UINT_32 value; @@ -2666,6 +2715,31 @@ ADDR_E_RETURNCODE ADDR_API Addr2ComputeSurfaceAddrFromCoord( const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut); +/** +**************************************************************************************************** +* ADDR_COPY_FLAGS +* +* @brief +* Options controlling image copy functions. +**************************************************************************************************** +*/ +typedef union _ADDR_COPY_FLAGS { + struct + { + UINT_32 blockMemcpy : 1; ///< Memory layout is pre-swizzled and stored block-by-block. + /// For regions in the miptail, this uses hybrid memcpy. + /// Regions must cover full width/height of the subresource. + UINT_32 hybridMemcpy : 1; ///< Memory layout is partially pre-swizzled and stored + /// microblock-by-microblock. Data in this format is agnostic to + /// chip harvesting and block size. Regions will be padded out + /// to microblock boundaries for alignment. + /// Mutually exclusive with 'blockMemcpy'. 
+ UINT_32 reserved : 30; ///< Reserved bits + }; + + UINT_32 value; +} ADDR_COPY_FLAGS; + /** **************************************************************************************************** * ADDR2_COPY_MEMSURFACE_REGION @@ -2718,6 +2792,7 @@ typedef struct _ADDR2_COPY_MEMSURFACE_INPUT /// - copyDims.depth == 1 /// - all copy regions target the same mip /// - all copy regions target the same slice/depth + ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed. } ADDR2_COPY_MEMSURFACE_INPUT; /** @@ -4008,30 +4083,34 @@ typedef union _ADDR2_SWMODE_SET */ typedef struct _ADDR2_GET_PREFERRED_SURF_SETTING_INPUT { - UINT_32 size; ///< Size of this structure in bytes + UINT_32 size; ///< Size of this structure in bytes - ADDR2_SURFACE_FLAGS flags; ///< Surface flags - AddrResourceType resourceType; ///< Surface type - AddrFormat format; ///< Surface format - AddrResrouceLocation resourceLoction; ///< Surface heap choice - ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting - ///< such as linear for DXTn, tiled for YUV - ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted - BOOL_32 noXor; ///< Do not use xor mode for this resource - UINT_32 bpp; ///< bits per pixel - UINT_32 width; ///< Width (of mip0), in pixels - UINT_32 height; ///< Height (of mip0), in pixels - UINT_32 numSlices; ///< Number surface slice/depth (of mip0), - UINT_32 numMipLevels; ///< Total mipmap levels. - UINT_32 numSamples; ///< Number of samples - UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as - /// number of samples for normal AA; Set it to the - /// number of fragments for EQAA - UINT_32 maxAlign; ///< maximum base/size alignment requested by client - UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will - /// be padded to multiple of this value (in bytes) - DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible - /// size. + ADDR2_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + AddrResrouceLocation resourceLoction; ///< Surface heap choice + ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting + ///< such as linear for DXTn, tiled for YUV + ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted + BOOL_32 noXor; ///< Do not use xor mode for this resource + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as + /// number of samples for normal AA; Set it to the + /// number of fragments for EQAA + UINT_32 maxAlign; ///< maximum base/size alignment requested by client + UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will + /// be padded to multiple of this value (in bytes) + DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible + /// size. + bool useBlockBasedHeuristic; ///< Use the block-based heuristic for swizzle mode selection. + /// The heuristic has the property of image size predictably + /// with image extents, which is needed for Vulkan. 
It ignores + /// minSizeAlign, maxAlign and memoryBudget options } ADDR2_GET_PREFERRED_SURF_SETTING_INPUT; /** @@ -4488,6 +4567,7 @@ typedef struct _ADDR3_COPY_MEMSURFACE_INPUT /// - copyDims.depth == 1 /// - all copy regions target the same mip /// - all copy regions target the same slice/depth + ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed. } ADDR3_COPY_MEMSURFACE_INPUT; /** diff --git a/src/amd/addrlib/inc/addrtypes.h b/src/amd/addrlib/inc/addrtypes.h index 66f7ed4a4af..0934cf023a6 100644 --- a/src/amd/addrlib/inc/addrtypes.h +++ b/src/amd/addrlib/inc/addrtypes.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/amd/addrlib/meson.build b/src/amd/addrlib/meson.build index 7a5266f8f38..6341649152b 100644 --- a/src/amd/addrlib/meson.build +++ b/src/amd/addrlib/meson.build @@ -20,6 +20,7 @@ files_addrlib = files( 'src/core/addrobject.h', 'src/core/addrswizzler.cpp', 'src/core/addrswizzler.h', + 'src/core/addrswizzlersimd.h', 'src/core/coord.cpp', 'src/core/coord.h', 'src/gfx9/gfx9addrlib.cpp', diff --git a/src/amd/addrlib/src/addrinterface.cpp b/src/amd/addrlib/src/addrinterface.cpp index 52a1f91ef6d..2f2073dc0b6 100644 --- a/src/amd/addrlib/src/addrinterface.cpp +++ b/src/amd/addrlib/src/addrinterface.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -81,6 +81,34 @@ ADDR_E_RETURNCODE ADDR_API AddrDestroy( return returnCode; } +/** +**************************************************************************************************** +* AddrFormatProperties +* +* @brief +* Retreives properties of the specified format. +* +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API AddrFormatProperties( + ADDR_HANDLE hLib, + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut) +{ + ADDR_E_RETURNCODE retCode = ADDR_INVALIDPARAMS; + + if (hLib) + { + Lib* pLib = Lib::GetLib(hLib); + + if (pLib != NULL) + { + retCode = pLib->GetFormatProperties(in, pOut); + } + } + + return retCode; +} //////////////////////////////////////////////////////////////////////////////////////////////////// // Surface functions diff --git a/src/amd/addrlib/src/amdgpu_asic_addr.h b/src/amd/addrlib/src/amdgpu_asic_addr.h index 58e4be056d8..2f56743b040 100644 --- a/src/amd/addrlib/src/amdgpu_asic_addr.h +++ b/src/amd/addrlib/src/amdgpu_asic_addr.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,6 +29,7 @@ #define FAMILY_NV3 0x91 //# 145 / Navi: 3x #define FAMILY_STX 0x96 #define FAMILY_PHX 0x94 //# 148 / Phoenix +#define FAMILY_GFX1170 0x9A #define FAMILY_RMB 0x92 //# 146 / Rembrandt #define FAMILY_RPL 0x95 //# 149 / Raphael #define FAMILY_MDN 0x97 //# 151 / Mendocino @@ -109,6 +110,7 @@ #define AMDGPU_PHOENIX2_RANGE 0x80, 0xC0 //# 128 <= x < 192 #define AMDGPU_HAWK_POINT1_RANGE 0xC0, 0xF0 //# 192 <= x < 240 #define AMDGPU_HAWK_POINT2_RANGE 0xF0, 0xFF //# 240 <= x < 255 +#define AMDGPU_GFX1170_RANGE 0x01, 0x40 //# 1 <= x < 64 #define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255 #define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max @@ -189,6 +191,7 @@ #define ASICREV_IS_PHOENIX2(r) ASICREV_IS(r, PHOENIX2) #define ASICREV_IS_HAWK_POINT1(r) ASICREV_IS(r, HAWK_POINT1) #define ASICREV_IS_HAWK_POINT2(r) ASICREV_IS(r, HAWK_POINT2) +#define ASICREV_IS_GFX1170(r) ASICREV_IS(r, GFX1170) #define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT) #define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL) diff --git a/src/amd/addrlib/src/core/addrcommon.h b/src/amd/addrlib/src/core/addrcommon.h index f07de1aef93..9c1766ae603 100644 --- a/src/amd/addrlib/src/core/addrcommon.h +++ b/src/amd/addrlib/src/core/addrcommon.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -360,7 +360,7 @@ static inline UINT_32 BitMaskScanForward( { ADDR_ASSERT(mask > 0); unsigned long out = 0; -#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64)) +#if ((defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))) && !defined(_M_ARM64EC) out = ::_tzcnt_u32(mask); #elif (defined(_WIN32) || defined(_WIN64)) ::_BitScanForward(&out, mask); @@ -436,6 +436,22 @@ static inline UINT_64 IsPow2( return !(dim & (dim - 1)); } +/** +**************************************************************************************************** +* RoundUpToMultiple +* +* @brief +* Rounds up the specified integer to the nearest multiple of the specified alignment value. +**************************************************************************************************** +*/ +template +constexpr T RoundUpToMultiple( + T operand, ///< Value to be aligned. + T alignment) ///< Alignment desired. +{ + return (((operand + (alignment - 1)) / alignment) * alignment); +} + /** **************************************************************************************************** * PowTwoAlign @@ -647,6 +663,25 @@ static inline UINT_32 Log2( return (x != 0) ? (31 ^ BitMaskScanReverse(x)) : 0; } +/** +**************************************************************************************************** +* ConstexprLog2 +* +* @brief +* Compute log of base 2 no matter the target is power of 2 or not. Returns 0 if 0. 
+**************************************************************************************************** +*/ +static constexpr inline UINT_32 ConstexprLog2( + UINT_32 x) ///< [in] the value should calculate log based 2 +{ + UINT_32 out = 0; + while (x >>= 1) + { + out++; + } + return out; +} + /** **************************************************************************************************** * QLog2 diff --git a/src/amd/addrlib/src/core/addrlib.cpp b/src/amd/addrlib/src/core/addrlib.cpp index 457ee43129a..916b77b5309 100644 --- a/src/amd/addrlib/src/core/addrlib.cpp +++ b/src/amd/addrlib/src/core/addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -222,6 +222,7 @@ ADDR_E_RETURNCODE Lib::Create( case FAMILY_NV3: case FAMILY_STX: case FAMILY_PHX: + case FAMILY_GFX1170: pLib = Gfx11HwlInit(&client); break; case FAMILY_NV4: @@ -304,6 +305,44 @@ ADDR_E_RETURNCODE Lib::Create( return returnCode; } +/** +**************************************************************************************************** +* Lib::GetFormatProperties +* +* @brief +* Returns the properties of the format as specifed in the input. +* @return +* ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE Lib::GetFormatProperties( + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((in.size != sizeof(ADDR_FORMAT_PROPERTIES_IN)) || + (pOut->size != sizeof(ADDR_FORMAT_PROPERTIES_OUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + if (returnCode == ADDR_OK) + { + pOut->bpp = GetElemLib()->GetBitsPerPixel(in.format, + nullptr, // elemMode, unused + &pOut->expand.width, + &pOut->expand.height, + nullptr); // unused bits + } + + return returnCode; +} + /** **************************************************************************************************** * Lib::SetChipFamily @@ -315,7 +354,7 @@ ADDR_E_RETURNCODE Lib::Create( **************************************************************************************************** */ VOID Lib::SetChipFamily( - UINT_32 uChipFamily, ///< [in] chip family defined in atiih.h + UINT_32 uChipFamily, ///< [in] chip family defined in atiid.h UINT_32 uChipRevision) ///< [in] chip revision defined in "asic_family"_id.h { ChipFamily family = HwlConvertChipFamily(uChipFamily, uChipRevision); @@ -668,6 +707,47 @@ UINT_32 Lib::GetBpe(AddrFormat format) const return GetElemLib()->GetBitsPerPixel(format); } +/** +**************************************************************************************************** +* Lib::GetSwizzleModePreferenceRatio +* +* @brief +* Get ratio driving swizzle mode selection heuristic. 
Ratio is returned as fraction nominator +* and denominator +* @return +* void +**************************************************************************************************** +*/ +void Lib::GetSwizzleModePreferenceRatio( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + UINT_32* pOutRatioLo, + UINT_32* pOutRatioHi + ) const +{ + const BOOL_32 computeMinSize = (pIn->flags.minimizeAlign == 1) || (pIn->memoryBudget >= 1.0); + + if (computeMinSize) + { + *pOutRatioLo = 1; + *pOutRatioHi = 1; + } + else if (pIn->flags.opt4space) + { + *pOutRatioLo = 3; + *pOutRatioHi = 2; + } + else if (pIn->flags.computeMaxSize) + { + *pOutRatioLo = 1024; + *pOutRatioHi = 1; + } + else + { + *pOutRatioLo = 2; + *pOutRatioHi = 1; + } +} + /** ************************************************************************************************************************ * Lib::ComputeOffsetFromSwizzlePattern diff --git a/src/amd/addrlib/src/core/addrlib.h b/src/amd/addrlib/src/core/addrlib.h index 64881a7f434..7f2a4fe0ad6 100644 --- a/src/amd/addrlib/src/core/addrlib.h +++ b/src/amd/addrlib/src/core/addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -300,6 +300,10 @@ public: delete this; } + ADDR_E_RETURNCODE GetFormatProperties( + const ADDR_FORMAT_PROPERTIES_IN& in, + ADDR_FORMAT_PROPERTIES_OUT* pOut) const; + static Lib* GetLib(ADDR_HANDLE hLib); /// Returns which version of addrlib functions should be used. @@ -333,6 +337,10 @@ public: UINT_32 GetBpe(AddrFormat format) const; + void GetSwizzleModePreferenceRatio( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + UINT_32* pOutRatioLo, + UINT_32* pOutRatioHi) const; static UINT_32 ComputeOffsetFromSwizzlePattern( const UINT_64* pPattern, diff --git a/src/amd/addrlib/src/core/addrlib1.cpp b/src/amd/addrlib/src/core/addrlib1.cpp index fc459f5684e..2742ce18467 100644 --- a/src/amd/addrlib/src/core/addrlib1.cpp +++ b/src/amd/addrlib/src/core/addrlib1.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -2473,7 +2473,7 @@ UINT_64 Lib::HwlComputeXmaskAddrFromCoord( // macroTileIndexX = x / macroTileWidth; macroTileIndexY = y / macroTileHeight; - macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes; + macroTileOffset = (static_cast(macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes; // // Compute the pixel offset within the macro tile. @@ -2675,7 +2675,7 @@ VOID Lib::ComputeSurfaceCoordFromAddrMicroTiled( // sliceBits = static_cast(pitch) * height * microTileThickness * bpp * numSamples; - rowBits = (pitch / MicroTileWidth) * microTileBits; + rowBits = static_cast(pitch / MicroTileWidth) * microTileBits; // // Extract the slice index. 
@@ -3559,11 +3559,11 @@ BOOL_32 Lib::DegradeTo1D( if (degrade == FALSE) { // Only check width and height as slices are aligned to thickness - UINT_64 unalignedSize = width * height; + UINT_64 unalignedSize = static_cast(width) * height; UINT_32 alignedPitch = PowTwoAlign(width, macroTilePitchAlign); UINT_32 alignedHeight = PowTwoAlign(height, macroTileHeightAlign); - UINT_64 alignedSize = alignedPitch * alignedHeight; + UINT_64 alignedSize = static_cast(alignedPitch) * alignedHeight; // alignedSize > 1.5 * unalignedSize if (2 * alignedSize > 3 * unalignedSize) diff --git a/src/amd/addrlib/src/core/addrlib2.cpp b/src/amd/addrlib/src/core/addrlib2.cpp index b3920f34959..a46aa4617a5 100644 --- a/src/amd/addrlib/src/core/addrlib2.cpp +++ b/src/amd/addrlib/src/core/addrlib2.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -207,7 +207,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( // Overwrite these parameters if we have a valid format } - if (localIn.bpp != 0) + if (localIn.bpp >= 8) { localIn.width = Max(localIn.width, 1u); localIn.height = Max(localIn.height, 1u); @@ -444,8 +444,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface( void* pMipBase = VoidPtrInc(pIn->pMappedSurface, (pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset)); - const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width; - const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch; + const size_t lineSizeBytes = (static_cast(localIn.bpp) >> 3) * pCurRegion->copyDims.width; + const size_t lineImgPitchBytes = (static_cast(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch; for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++) { @@ -504,6 +504,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -573,6 +578,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -1424,7 +1434,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( { pOut->addr = (localOut.sliceSize * pIn->slice) + mipInfo[pIn->mipId].offset + - (pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); + (static_cast(pIn->y) * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); pOut->bitPosition = 0; } else diff --git a/src/amd/addrlib/src/core/addrlib3.cpp b/src/amd/addrlib/src/core/addrlib3.cpp index b809f908375..ac9d1022bad 100644 --- a/src/amd/addrlib/src/core/addrlib3.cpp +++ b/src/amd/addrlib/src/core/addrlib3.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. 
All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,8 +39,6 @@ namespace V3 Lib::Lib() : Addr::Lib(), - m_pipesLog2(0), - m_pipeInterleaveLog2(0), m_numEquations(0) { Init(); @@ -59,8 +57,6 @@ Lib::Lib( const Client* pClient) : Addr::Lib(pClient), - m_pipesLog2(0), - m_pipeInterleaveLog2(0), m_numEquations(0) { Init(); @@ -265,7 +261,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( // Overwrite these parameters if we have a valid format } - if (localIn.bpp != 0) + if (localIn.bpp >= 8) { localIn.width = Max(localIn.width, 1u); localIn.height = Max(localIn.height, 1u); @@ -547,8 +543,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface( void* pMipBase = VoidPtrInc(pIn->pMappedSurface, (pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset)); - const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width; - const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch; + const size_t lineSizeBytes = (static_cast(localIn.bpp) >> 3) * pCurRegion->copyDims.width; + const size_t lineImgPitchBytes = (static_cast(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch; for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++) { @@ -611,6 +607,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -680,6 +681,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem( { returnCode = ADDR_INVALIDPARAMS; } + else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy) + { + // Invalid to specify conflicting copy modes. + returnCode = ADDR_INVALIDPARAMS; + } else { UINT_32 baseSlice = pRegions[0].slice; @@ -736,7 +742,7 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor( const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) { - ADDR_E_RETURNCODE returnCode; + ADDR_E_RETURNCODE returnCode = ADDR_OK; if ((GetFillSizeFieldsFlags() == TRUE) && ((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) || @@ -746,7 +752,23 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor( } else { - returnCode = HwlComputePipeBankXor(pIn, pOut); + // The swizzle mode determines how many unused bits there are in the address. We never (ok, rarely...) program + // the low eight bits of the address, so the "numSwizzleBits" effectively represents the number of "guaranteed + // zero" programmed bits in the address. + const UINT_32 numSwizzleBits = GetBlockSizeLog2(pIn->swizzleMode, FALSE) - 8; + + // make sure this configuration supports swizzling + if (numSwizzleBits != 0) + { + // These cases should have been excluded with the "numSwizzleBits" calculation above, but make sure here. + ADDR_ASSERT((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE)); + + pOut->pipeBankXor = pIn->surfIndex % (1 << numSwizzleBits); + } + else + { + pOut->pipeBankXor = 0; + } } return returnCode; @@ -1167,7 +1189,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfoSanityCheck( return HwlValidateNonSwModeParams(&localIn) ? 
ADDR_OK : ADDR_INVALIDPARAMS; } - /** ************************************************************************************************************************ * Lib::ComputeOffsetFromEquation diff --git a/src/amd/addrlib/src/core/addrlib3.h b/src/amd/addrlib/src/core/addrlib3.h index 6f6ecf6d99f..ae4e6b7fd8f 100644 --- a/src/amd/addrlib/src/core/addrlib3.h +++ b/src/amd/addrlib/src/core/addrlib3.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -44,7 +44,6 @@ struct ADDR3_COORD struct ADDR3_COMPUTE_SURFACE_INFO_PARAMS_INPUT { const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pSurfInfo; - void* pvAddrParams; }; /** @@ -155,14 +154,16 @@ protected: Lib(); // Constructor is protected Lib(const Client* pClient); - UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2 - UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes - SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table // Number of unique MSAA sample rates (1/2/4/8) static const UINT_32 MaxNumMsaaRates = 4; + //# These fields exist in the GB_ADDR_CONFIG register; however, the HW does not care about them. + //# The HW acts as if the log2(pipes)==5 and log2(pi) == 8, always. + static const UINT_32 NumPipesLog2 = 5; + static const UINT_32 PipeInterleaveLog2 = 8; + // Number of equation entries in the table UINT_32 m_numEquations; @@ -444,4 +445,4 @@ private: } // V3 } // Addr -#endif \ No newline at end of file +#endif diff --git a/src/amd/addrlib/src/core/addrswizzler.cpp b/src/amd/addrlib/src/core/addrswizzler.cpp index b27851b9ec2..19e7d212bb1 100644 --- a/src/amd/addrlib/src/core/addrswizzler.cpp +++ b/src/amd/addrlib/src/core/addrswizzler.cpp @@ -2,7 +2,8 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -14,6 +15,7 @@ */ #include "addrswizzler.h" +#include "addrswizzlersimd.h" namespace Addr { @@ -98,6 +100,23 @@ void LutAddresser::InitSwizzleProps() m_sLutMask |= m_bit[i].s; } + // Derive the microblock size from the swizzle equation. 
+ UINT_32 xMbMask = 0; + UINT_32 yMbMask = 0; + UINT_32 zMbMask = 0; + for (UINT_32 i = 0; i < 8; i++) + { + xMbMask |= m_bit[i].x; + yMbMask |= m_bit[i].y; + zMbMask |= m_bit[i].z; + } + m_microBlockSize.width = xMbMask + 1; + m_microBlockSize.height = yMbMask + 1; + m_microBlockSize.depth = zMbMask + 1; + ADDR_ASSERT(IsPow2(m_microBlockSize.width)); + ADDR_ASSERT(IsPow2(m_microBlockSize.height)); + ADDR_ASSERT(IsPow2(m_microBlockSize.depth)); + // An expandX of 1 is a no-op m_maxExpandX = 1; if (m_sLutMask == 0) @@ -153,7 +172,7 @@ void LutAddresser::InitLuts() m_pYLut = &m_lutData[0]; ADDR_ASSERT(m_pYLut[0] == 0); } - + if (m_zLutMask != 0) { m_pZLut = &m_lutData[curOffset]; @@ -269,82 +288,33 @@ UINT_32 LutAddresser::EvalEquation( /** **************************************************************************************************** -* Copy2DSliceUnaligned +* CopyRowUnaligned * * @brief -* Copies an arbitrary 2D pixel region to or from a surface. +* Copies a single row to or from a surface. **************************************************************************************************** */ template -void Copy2DSliceUnaligned( - void* pImgBlockSliceStart, // Block corresponding to beginning of slice - void* pBuf, // Pointer to data starting from the copy origin. - size_t bufStrideY, // Stride of each row in pBuf - UINT_32 imageBlocksY, // Width of the image slice, in blocks. - ADDR_COORD2D origin, // Absolute origin, in elements - ADDR_EXTENT2D extent, // Size to copy, in elements - UINT_32 sliceXor, // Includes pipeBankXor and z XOR +void CopyRowUnaligned( + void* pRowImgBlockStart, // Pointer to the image block at x=0 + void* pBuf, // Pointer to data at x=0 + UINT_32 xStart, // x value to start at + UINT_32 xEnd, // x value to finish at (not inclusive) + UINT_32 rowXor, // Value to XOR in for each address (makes up PBX and y/z coords) const LutAddresser& addresser) { - UINT_32 xStart = origin.x; - UINT_32 xEnd = origin.x + extent.width; - + UINT_32 x = xStart; constexpr UINT_32 PixBytes = (1 << BPELog2); - // Apply a negative offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] - pBuf = VoidPtrDec(pBuf, xStart * PixBytes); - - // Do things one row at a time for unaligned regions. - for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++) + // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned + // regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though. + if (ExpandX > 1) { - UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY; - UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); - - UINT_32 x = xStart; - - // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned - // regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though. 
- if (ExpandX > 1) + // Unaligned left edge + for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++) { - // Unaligned left edge - for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++) - { - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); - if (ImgIsDest) - { - memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes); - } - else - { - memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); - } - } - // Aligned middle - for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX) - { - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); - if (ImgIsDest) - { - memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX); - } - else - { - memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX); - } - } - } - // Unaligned end (or the whole thing when ExpandX == 1) - for (; x < xEnd; x++) - { - // Get the index of the block within the slice - UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits())); - // Apply that index to get the base address of the current block. - void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits()); - // Grab the x-xor and XOR it all together, adding to get the final address + UINT_32 blk = (x >> addresser.GetBlockXBits()); + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); if (ImgIsDest) { @@ -355,8 +325,478 @@ void Copy2DSliceUnaligned( memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); } } + // Aligned middle + for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX) + { + UINT_32 blk = (x >> addresser.GetBlockXBits()); + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); + void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); + if (ImgIsDest) + { + memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX); + } + else + { + memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX); + } + } + } + // Unaligned end (or the whole thing when ExpandX == 1) + for (; x < xEnd; x++) + { + // Get the index of the block within the slice + UINT_32 blk = (x >> addresser.GetBlockXBits()); + // Apply that index to get the base address of the current block. + void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits()); + // Grab the x-xor and XOR it all together, adding to get the final address + void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x)); + if (ImgIsDest) + { + memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes); + } + else + { + memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes); + } + } +} - pBuf = VoidPtrInc(pBuf, bufStrideY); +/** +**************************************************************************************************** +* CopyImgUnaligned +* +* @brief +* Copies an arbitrary 3D pixel region to or from a surface. +**************************************************************************************************** +*/ +template +void CopyImgUnaligned( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. 
+ size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + constexpr UINT_32 PixBytes = (1 << BPELog2); + + // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] + // Keep the z offset. + pBuf = VoidPtrDec(pBuf, origin.x * PixBytes); + + void* pSliceBuf = pBuf; + // Do things one slice/row at a time for unaligned regions. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z++) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + void* pRowBuf = pSliceBuf; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++) + { + UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + UINT_64 rowOffset = ((zBlk + yBlk) << addresser.GetBlockBits()); + + void* pImgBlockRow = VoidPtrInc(pImgBlockStart, rowOffset); + + CopyRowUnaligned( + pImgBlockRow, + pRowBuf, + origin.x, + origin.x + extent.width, + rowXor, + addresser); + + pRowBuf = VoidPtrInc(pRowBuf, bufStrideY); + } + pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ); + } +} + + +/** +**************************************************************************************************** +* HandleUnalignedRegions +* +* @brief +* Does unaligned copies for any X/Y/Z edges that are not fully aligned, fixing up the +* copy region and pointer to point at the aligned region that remains. +**************************************************************************************************** +*/ +template +void HandleUnalignedRegions( + void* pImgBlockStart, // Block corresponding to beginning of image + void** ppBuf, // Pointer to pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D* pOrigin, // Absolute origin, in elements + ADDR_EXTENT3D* pExtent, // Size to copy, in elements + ADDR_EXTENT3D align, // Size to align on, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + constexpr bool ImgIsDest = true; + + // Go through the start/end of the x/y/z extents and copy the parts that aren't aligned. 
+ if (pOrigin->x != PowTwoAlign(pOrigin->x, align.width)) + { + UINT_32 xSize = Min(pOrigin->x + pExtent->width, PowTwoAlign(pOrigin->x, align.width)) - pOrigin->x; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { xSize, pExtent->height, pExtent->depth}, + pipeBankXor, + isInMipTail, + addresser); + pExtent->width -= xSize; + pOrigin->x += xSize; + *ppBuf = VoidPtrInc(*ppBuf, xSize << BPELog2); + } + if (pOrigin->y != PowTwoAlign(pOrigin->y, align.height)) + { + UINT_32 ySize = Min(pOrigin->y + pExtent->height, PowTwoAlign(pOrigin->y, align.height)) - pOrigin->y; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { pExtent->width, ySize, pExtent->depth}, + pipeBankXor, + isInMipTail, + addresser); + pExtent->height -= ySize; + pOrigin->y += ySize; + *ppBuf = VoidPtrInc(*ppBuf, ySize * bufStrideY); + } + if (pOrigin->z != PowTwoAlign(pOrigin->z, align.depth)) + { + UINT_32 zSize = Min(pOrigin->z + pExtent->depth, PowTwoAlign(pOrigin->z, align.depth)) - pOrigin->z; + CopyImgUnaligned( + pImgBlockStart, + *ppBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + *pOrigin, + { pExtent->width, pExtent->height, zSize }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->depth -= zSize; + pOrigin->z += zSize; + *ppBuf = VoidPtrInc(*ppBuf, zSize * bufStrideZ); + } + + // At this point the starts are aligned, so we can care about just size rather than origin+size. + if ((pExtent->width) != PowTwoAlignDown(pExtent->width, align.width)) + { + UINT_32 xAlignedSize = PowTwoAlignDown(pOrigin->x + pExtent->width, align.width) - pOrigin->x; + void* pBuf = VoidPtrInc(*ppBuf, xAlignedSize << BPELog2); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x + xAlignedSize, pOrigin->y, pOrigin->z}, + { pExtent->width - xAlignedSize, pExtent->height, pExtent->depth }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->width = xAlignedSize; + } + + if ((pExtent->height) != PowTwoAlignDown(pExtent->height, align.height)) + { + UINT_32 yAlignedSize = PowTwoAlignDown(pOrigin->y + pExtent->height, align.height) - pOrigin->y; + void* pBuf = VoidPtrInc(*ppBuf, yAlignedSize * bufStrideY); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x, pOrigin->y + yAlignedSize, pOrigin->z}, + { pExtent->width, pExtent->height - yAlignedSize, pExtent->depth }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->height = yAlignedSize; + } + + if ((pExtent->depth) != PowTwoAlignDown(pExtent->depth, align.depth)) + { + UINT_32 zAlignedSize = PowTwoAlignDown(pOrigin->z + pExtent->depth, align.depth) - pOrigin->z; + void* pBuf = VoidPtrInc(*ppBuf, zAlignedSize * bufStrideZ); + CopyImgUnaligned( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + { pOrigin->x, pOrigin->y, pOrigin->z + zAlignedSize }, + { pExtent->width, pExtent->height, pExtent->depth - zAlignedSize }, + pipeBankXor, + isInMipTail, + addresser); + pExtent->depth = zAlignedSize; + } +} + +/** +**************************************************************************************************** +* CopyMemImgHybrid +* +* @brief +* Copies a 3D pixel region to a surface. Uses fast copies for fully covered microblocks. 
+**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgHybrid( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + // Handle unaligned edges in x/y/z and fixup the extents to match. + HandleUnalignedRegions( + pImgBlockStart, + &pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + &origin, + &extent, + MicroSw::MicroBlockExtent, + pipeBankXor, + isInMipTail, + addresser + ); + + // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x] + // Keep the z offset. + pBuf = VoidPtrDec(pBuf, origin.x << MicroSw::BpeLog2); + + void* pSliceBuf = pBuf; + // Do things one slice/row at a time for unaligned regions. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += MicroSw::MicroBlockExtent.depth) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + void* pRowBuf = pSliceBuf; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += MicroSw::MicroBlockExtent.height) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + + for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += MicroSw::MicroBlockExtent.width) + { + UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk; + UINT_64 offset = (xBlk << addresser.GetBlockBits()); + offset ^= rowXor; + offset ^= addresser.GetAddressX(x); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + void* pPixBuf = VoidPtrInc(pRowBuf, x << MicroSw::BpeLog2); + + MicroSw::CopyMicroBlock( + pPix, + pPixBuf, + bufStrideY, + bufStrideZ + ); + } + pRowBuf = VoidPtrInc(pRowBuf, bufStrideY * MicroSw::MicroBlockExtent.height); + } + pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ * MicroSw::MicroBlockExtent.depth); + } +} + +/** +**************************************************************************************************** +* CopyMemImgMicroblocks +* +* @brief +* Copies the microblocks of a 3D pixel region to/from a surface. +**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgMicroblocks( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf, ignored. + size_t bufStrideZ, // Stride of each slice in pBuf, ignored. + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + // Pad out our dims to microblock boundaries. 
+ origin.x = PowTwoAlignDown(origin.x, addresser.GetMicroBlockX()); + origin.y = PowTwoAlignDown(origin.y, addresser.GetMicroBlockY()); + origin.z = PowTwoAlignDown(origin.z, addresser.GetMicroBlockZ()); + extent.width = PowTwoAlign(extent.width, addresser.GetMicroBlockX()); + extent.height = PowTwoAlign(extent.height, addresser.GetMicroBlockY()); + extent.depth = PowTwoAlign(extent.depth, addresser.GetMicroBlockZ()); + + // Calculate the address of the first pixel in each microblock (256B), then copy it. + for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetMicroBlockZ()) + { + UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z); + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetMicroBlockY()) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y); + + for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += addresser.GetMicroBlockX()) + { + UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk; + UINT_64 offset = (xBlk << addresser.GetBlockBits()); + offset ^= rowXor; + offset ^= addresser.GetAddressX(x); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + constexpr UINT_32 CopySize = 1 << 8; + +#if ADDR_HAS_AVX2 + if (NonTemporal && ImgIsDest) + { + StreamCopyToImgAligned(pPix, pBuf, CopySize); + } + else if (NonTemporal) + { + StreamCopyFromImgAligned(pBuf, pPix, CopySize); + } + else +#endif + if (ImgIsDest) + { + memcpy(pPix, pBuf, CopySize); + } + else + { + memcpy(pBuf, pPix, CopySize); + } + pBuf = VoidPtrInc(pBuf, CopySize); + } + } + } +} + +/** +**************************************************************************************************** +* CopyMemImgBlocks +* +* @brief +* Copies the blocks of a 3D pixel region to/from a surface. +**************************************************************************************************** +*/ +template +AVX2_FUNC NEON_FUNC void CopyMemImgBlocks( + void* pImgBlockStart, // Block corresponding to beginning of image + void* pBuf, // Pointer to data starting from the copy origin. + size_t bufStrideY, // Stride of each row in pBuf, ignored. + size_t bufStrideZ, // Stride of each slice in pBuf, ignored. + UINT_32 imageBlocksY, // Width of the image slice, in blocks. + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. + ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to xor in + BOOL_32 isInMipTail, // True if this is in the mip tail. + const LutAddresser& addresser) +{ + if (isInMipTail) + { + return CopyMemImgMicroblocks( + pImgBlockStart, + pBuf, + bufStrideY, + bufStrideZ, + imageBlocksY, + imageBlocksZ, + origin, + extent, + pipeBankXor, + isInMipTail, + addresser + ); + } + + // Pad out our dims to block boundaries. + origin.x = PowTwoAlignDown(origin.x, addresser.GetBlockX()); + origin.y = PowTwoAlignDown(origin.y, addresser.GetBlockY()); + origin.z = PowTwoAlignDown(origin.z, addresser.GetBlockZ()); + extent.width = PowTwoAlign(extent.width, addresser.GetBlockX()); + extent.height = PowTwoAlign(extent.height, addresser.GetBlockY()); + extent.depth = PowTwoAlign(extent.depth, addresser.GetBlockZ()); + + // Copy block by block. No complex swizzling here, everything is in (strided) typewriter order. 
+ for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetBlockZ()) + { + UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ; + for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetBlockY()) + { + UINT_32 yBlk = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk; + UINT_32 xBlkStart = (origin.x >> addresser.GetBlockXBits()) + yBlk; + UINT_32 numXBlk = extent.width >> addresser.GetBlockXBits(); + UINT_64 offset = (xBlkStart << addresser.GetBlockBits()); + + void* pPix = VoidPtrInc(pImgBlockStart, offset); + UINT_32 copySize = numXBlk << addresser.GetBlockBits(); + +#if ADDR_HAS_AVX2 + if (NonTemporal && ImgIsDest) + { + StreamCopyToImgAligned(pPix, pBuf, copySize); + } + else if (NonTemporal) + { + StreamCopyFromImgAligned(pBuf, pPix, copySize); + } + else +#endif + if (ImgIsDest) + { + memcpy(pPix, pBuf, copySize); + } + else + { + memcpy(pBuf, pPix, copySize); + } + pBuf = VoidPtrInc(pBuf, copySize); + } } } @@ -368,33 +808,130 @@ void Copy2DSliceUnaligned( * Determines and returns which copy function to use for copying to images **************************************************************************************************** */ -UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const +UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc( + ADDR_COPY_FLAGS flags + ) const { + UnalignedCopyMemImgFunc pfnRet = nullptr; + // This key encodes how the bottom 8 bits (256B) are formed, so we can match to the correct optimized + // swizzle function (they are all swizzle-agnostic beyond those 256B). + UINT_64 microSwKey = GetMicroSwKey(reinterpret_cast(&m_bit[0])); + + if (flags.blockMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgBlocks; + } + else +#endif + { + pfnRet = CopyMemImgBlocks; + } + } + + if ((pfnRet == nullptr) && flags.hybridMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgMicroblocks; + } + else +#endif + { + pfnRet = CopyMemImgMicroblocks; + } + } + + // If this is one of the known microswizzles and CPU support is present, use a hybrid copy that does + // SIMD swizzling for aligned regions and falls back for unaligned edges. 
+#if ADDR_HAS_AVX2 + static constexpr struct { + UINT_64 microSwKey; + UnalignedCopyMemImgFunc pfn; + } AvxFuncs[] = { + { GetMicroSwKey(MicroSw_2D_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_8BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_16BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_8BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_16BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_2BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_4BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_Z_1BPE_AVX2::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_D_1BPE_AVX2::MicroEq), CopyMemImgHybrid} + }; + if ((pfnRet == nullptr) && CpuSupportsAvx2()) + { + for (const auto& func : AvxFuncs) + { + if (func.microSwKey == microSwKey) + { + pfnRet = func.pfn; + break; + } + } + } +#endif // ADDR_HAS_AVX2 + +#if ADDR_HAS_NEON + static constexpr struct { + UINT_64 microSwKey; + UnalignedCopyMemImgFunc pfn; + } NeonFuncs[] = { + { GetMicroSwKey(MicroSw_2D_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_8BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_2D_16BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_8BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_3D_16BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_2BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_R_4BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_Z_1BPE_NEON::MicroEq), CopyMemImgHybrid}, + { GetMicroSwKey(MicroSw_D_1BPE_NEON::MicroEq), CopyMemImgHybrid} + }; + if ((pfnRet == nullptr) && CpuSupportsNeon()) + { + for (const auto& func : NeonFuncs) + { + if (func.microSwKey == microSwKey) + { + pfnRet = func.pfn; + break; + } + } + } +#endif // ADDR_HAS_NEON + // While these are all the same function, the codegen gets really bad if the size of each pixel // is not known at compile time. Hence, templates. 
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] = { // ExpandX = 1, 2, 4 - { Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE - { Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE - { Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE - { Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE - { Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE + { CopyImgUnaligned<0, 1, true>, CopyImgUnaligned<0, 2, true>, CopyImgUnaligned<0, 4, true> }, // 1BPE + { CopyImgUnaligned<1, 1, true>, CopyImgUnaligned<1, 2, true>, CopyImgUnaligned<1, 4, true> }, // 2BPE + { CopyImgUnaligned<2, 1, true>, CopyImgUnaligned<2, 2, true>, CopyImgUnaligned<2, 4, true> }, // 4BPE + { CopyImgUnaligned<3, 1, true>, CopyImgUnaligned<3, 2, true>, CopyImgUnaligned<3, 4, true> }, // 8BPE + { CopyImgUnaligned<4, 1, true>, CopyImgUnaligned<4, 2, true>, CopyImgUnaligned<4, 4, true> }, // 16BPE }; - UnalignedCopyMemImgFunc pfnRet = nullptr; - ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); - if (m_maxExpandX >= 4) + // Fallback functions + if (pfnRet == nullptr) { - pfnRet = Funcs[m_bpeLog2][2]; - } - else if (m_maxExpandX >= 2) - { - pfnRet = Funcs[m_bpeLog2][1]; - } - else - { - pfnRet = Funcs[m_bpeLog2][0]; + ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); + pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))]; } return pfnRet; } @@ -407,35 +944,139 @@ UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const * Determines and returns which copy function to use for copying from images **************************************************************************************************** */ -UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const +UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc( + ADDR_COPY_FLAGS flags + ) const { + UnalignedCopyMemImgFunc pfnRet = nullptr; + if (flags.blockMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgBlocks; + } + else +#endif + { + pfnRet = CopyMemImgBlocks; + } + } + + if ((pfnRet == nullptr) && flags.hybridMemcpy) + { +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + pfnRet = CopyMemImgMicroblocks; + } + else +#endif + { + pfnRet = CopyMemImgMicroblocks; + } + } // While these are all the same function, the codegen gets really bad if the size of each pixel // is not known at compile time. Hence, templates. 
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] = { // ExpandX = 1, 2, 4 - { Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE - { Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE - { Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE - { Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE - { Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE + { CopyImgUnaligned<0, 1, false>, CopyImgUnaligned<0, 2, false>, CopyImgUnaligned<0, 4, false> }, // 1BPE + { CopyImgUnaligned<1, 1, false>, CopyImgUnaligned<1, 2, false>, CopyImgUnaligned<1, 4, false> }, // 2BPE + { CopyImgUnaligned<2, 1, false>, CopyImgUnaligned<2, 2, false>, CopyImgUnaligned<2, 4, false> }, // 4BPE + { CopyImgUnaligned<3, 1, false>, CopyImgUnaligned<3, 2, false>, CopyImgUnaligned<3, 4, false> }, // 8BPE + { CopyImgUnaligned<4, 1, false>, CopyImgUnaligned<4, 2, false>, CopyImgUnaligned<4, 4, false> }, // 16BPE }; - UnalignedCopyMemImgFunc pfnRet = nullptr; ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2); - if (m_maxExpandX >= 4) + if (pfnRet == nullptr) { - pfnRet = Funcs[m_bpeLog2][2]; - } - else if (m_maxExpandX >= 2) - { - pfnRet = Funcs[m_bpeLog2][1]; - } - else - { - pfnRet = Funcs[m_bpeLog2][0]; + pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))]; } return pfnRet; } +/** +**************************************************************************************************** +* LutAddresser::DoCopyImgMemPreFlushes +* +* @brief +* Does any flushes required for nontemporal SIMD instructions to access the image memory. +**************************************************************************************************** +*/ +void LutAddresser::DoCopyImgMemPreFlushes( + ADDR_COPY_FLAGS flags + ) const +{ +#if ADDR_HAS_AVX2 + if ((flags.blockMemcpy || flags.hybridMemcpy) && CpuSupportsAvx2()) + { + // Loads are weakly ordered, and we need to ensure they start after the previous copy + NonTemporalLoadStoreFence(); + } +#endif +} + +/** +**************************************************************************************************** +* LutAddresser::DoCopyMemImgPostFlushes +* +* @brief +* Does any flushes required for nontemporal SIMD instructions to access the image memory. +**************************************************************************************************** +*/ +void LutAddresser::DoCopyMemImgPostFlushes( + ADDR_COPY_FLAGS flags + ) const +{ +#if ADDR_HAS_AVX2 + if (CpuSupportsAvx2()) + { + // Stores are weakly ordered, and we need to ensure they finish before the next submission + // or copy. 
+ NonTemporalStoreFence(); + } +#endif +} + + +#if __cplusplus < 201703L +// Constexpr arrays need an additional definition at namespace scope until c++17 +#if ADDR_HAS_AVX2 +constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_2BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_4BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_AVX2::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_D_1BPE_AVX2::MicroBlockExtent; +#endif +#if ADDR_HAS_NEON +constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_2BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_R_4BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_NEON::MicroBlockExtent; +constexpr ADDR_EXTENT3D MicroSw_D_1BPE_NEON::MicroBlockExtent; +#endif + +#endif + } diff --git a/src/amd/addrlib/src/core/addrswizzler.h b/src/amd/addrlib/src/core/addrswizzler.h index feace2761c1..7f615711b84 100644 --- a/src/amd/addrlib/src/core/addrswizzler.h +++ b/src/amd/addrlib/src/core/addrswizzler.h @@ -1,7 +1,8 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ /** @@ -26,10 +27,13 @@ typedef void (*UnalignedCopyMemImgFunc)( void* pImgBlockSliceStart, // Block corresponding to beginning of slice void* pBuf, // Pointer to data starting from the copy origin. size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ, // Stride of each slice in pBuf UINT_32 imageBlocksY, // Width of the image slice, in blocks. - ADDR_COORD2D origin, // Absolute origin, in elements - ADDR_EXTENT2D extent, // Size to copy, in elements - UINT_32 sliceXor, // Includes pipeBankXor and z XOR + UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks. 
+ ADDR_COORD3D origin, // Absolute origin, in elements + ADDR_EXTENT3D extent, // Size to copy, in elements + UINT_32 pipeBankXor, // Final value to XOR into the address + BOOL_32 isInMipTail, // True if this is in the mip tail. const LutAddresser& addresser); // This class calculates and holds up to four lookup tables (x/y/z/s) which can be used to cheaply calculate the @@ -60,10 +64,21 @@ public: // Get the block size UINT_32 GetBlockBits() const { return m_blockBits; } + UINT_32 GetBlockX() const { return m_blockSize.width; } + UINT_32 GetBlockY() const { return m_blockSize.height; } + UINT_32 GetBlockZ() const { return m_blockSize.depth; } UINT_32 GetBlockXBits() const { return Log2(m_blockSize.width); } UINT_32 GetBlockYBits() const { return Log2(m_blockSize.height); } UINT_32 GetBlockZBits() const { return Log2(m_blockSize.depth); } + // Get the microblock size + UINT_32 GetMicroBlockX() const { return m_microBlockSize.width; } + UINT_32 GetMicroBlockY() const { return m_microBlockSize.height; } + UINT_32 GetMicroBlockZ() const { return m_microBlockSize.depth; } + + // Get other image props + UINT_32 GetBpeLog2() const { return m_bpeLog2; } + // "Fast single channel" functions to get the part that each channel contributes to be XORd together. UINT_32 GetAddressX(UINT_32 x) const { return m_pXLut[x & m_xLutMask];} UINT_32 GetAddressY(UINT_32 y) const { return m_pYLut[y & m_yLutMask];} @@ -71,8 +86,11 @@ public: UINT_32 GetAddressS(UINT_32 s) const { return m_pSLut[s & m_sLutMask];} // Get a function that can copy a single 2D slice of an image with this swizzle. - UnalignedCopyMemImgFunc GetCopyMemImgFunc() const; - UnalignedCopyMemImgFunc GetCopyImgMemFunc() const; + UnalignedCopyMemImgFunc GetCopyMemImgFunc(ADDR_COPY_FLAGS flags) const; + UnalignedCopyMemImgFunc GetCopyImgMemFunc(ADDR_COPY_FLAGS flags) const; + + void DoCopyMemImgPostFlushes(ADDR_COPY_FLAGS flags) const; + void DoCopyImgMemPreFlushes(ADDR_COPY_FLAGS flags) const; private: // Calculate general properties of the swizzle equations void InitSwizzleProps(); @@ -99,6 +117,9 @@ private: // The block size ADDR_EXTENT3D m_blockSize; + + // The microblock size + ADDR_EXTENT3D m_microBlockSize; // Number of 'x' bits at the bottom of the equation. Must be a pow2 and at least 1. // This will be used as a simple optimization to batch together operations on adjacent x pixels. diff --git a/src/amd/addrlib/src/core/addrswizzlersimd.h b/src/amd/addrlib/src/core/addrswizzlersimd.h new file mode 100644 index 00000000000..fd53e4080a3 --- /dev/null +++ b/src/amd/addrlib/src/core/addrswizzlersimd.h @@ -0,0 +1,2031 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ +/** +**************************************************************************************************** +* @file addrswizzlersimd.h +* @brief Contains CPU/swizzle-specific code for efficient CPU swizzling. 
+****************************************************************************************************
+*/
+
+#ifndef __ADDR_SWIZZLER_SIMD_H__
+#define __ADDR_SWIZZLER_SIMD_H__
+
+#include "addrswizzler.h"
+#include "addrcommon.h"
+
+#if !ADDR_ALLOW_SIMD
+// Disabled
+#define ADDR_HAS_AVX2 0
+#define AVX2_FUNC
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+// x86 visual studio builds
+#define ADDR_HAS_AVX2 1
+#define AVX2_FUNC
+#elif defined(__x86_64__) || defined(__i386__)
+// x86 GCC/Clang builds
+#define ADDR_HAS_AVX2 1
+#define AVX2_FUNC [[gnu::target("avx2")]]
+#else
+// Unknown
+#define ADDR_HAS_AVX2 0
+#define AVX2_FUNC
+#endif
+
+#if !ADDR_ALLOW_SIMD
+// Disabled
+#define ADDR_HAS_NEON 0
+#define NEON_FUNC
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
+// arm visual studio builds
+#define ADDR_HAS_NEON 1
+#define NEON_FUNC
+#elif (defined(__linux__) || defined(_WIN32)) && (defined(__aarch64__) || (defined(__arm__) && defined(__ARM_FP)))
+// arm GCC/Clang builds on windows or linux
+#define ADDR_HAS_NEON 1
+#define NEON_FUNC
+#else
+// Unknown
+#define ADDR_HAS_NEON 0
+#define NEON_FUNC
+#endif
+
+#if ADDR_HAS_AVX2
+#if _MSC_VER
+#include <windows.h>
+#endif
+#include <immintrin.h>
+
+// Certain compiler versions lack this intrinsic.
+#if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8) || (_MSC_VER && !defined(_mm256_set_m128i))
+#define _mm256_set_m128i(hi, lo) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#endif
+#endif
+
+#if ADDR_HAS_NEON
+#if _WIN32
+#include <arm_neon.h>
+#else
+#include <arm_neon.h>
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+#endif
+
+namespace Addr
+{
+
+#if ADDR_HAS_AVX2
+static inline bool CpuSupportsAvx2() {
+    // Use compiler builtins to check for support
+#if _MSC_VER
+    return IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE);
+#elif defined(__GNUC__)
+    return __builtin_cpu_supports("avx2");
+#else
+#error "What platform is this?"
+#endif
+}
+#endif
+
+#if ADDR_HAS_NEON
+static inline bool CpuSupportsNeon() {
+    // ARM can't check this without OS help. Use OS knowledge and helpers to check for support.
+#if _WIN32
+    return true; // Mandatory for WoA
+#elif defined(__linux__) && defined(__aarch64__)
+    return ((getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0);
+#elif defined(__linux__) && defined(__arm__)
+    return ((getauxval(AT_HWCAP) & HWCAP_NEON) != 0);
+#else
+#error "What platform is this?"
+#endif
+}
+#endif
+
+constexpr UINT_64 GetMicroSwKey(const UINT_64* pEq, bool isPlanarMsaa = false)
+{
+    UINT_64 out = 0;
+
+    // Microswizzles never have more than 1 xor, so just use the log2 of each bit (6 bits each = 48 bits)
+    for (UINT_32 i = 0; i < 8; i++)
+    {
+        if (pEq[i] != 0)
+        {
+            out |= (static_cast<UINT_64>(ConstexprLog2(pEq[i]) + 1) << (i * 6));
+        }
+    }
+
+    // Bits 48+ can be other things. The big one is if this is actually MSAA but the sample bits are in the
+    // high part of the equation.
+    if (isPlanarMsaa)
+    {
+        out |= (static_cast<UINT_64>(1) << 48);
+    }
+
+    return out;
+}
+
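To make the packing above concrete, here is a small self-contained example that reproduces the same key scheme: each non-zero equation entry contributes log2(bit) + 1 in its own 6-bit field, so two micro-swizzles compare equal only if every low address bit is fed by the same source bit. The X*/Y* constants and MakeMicroSwKey below are simplified placeholders (single-bit masks), not addrlib's real channel encodings or its ConstexprLog2 helper.

```cpp
// Standalone sketch of the micro-swizzle key packing used by GetMicroSwKey().
// X0..X3 / Y0..Y3 are placeholder single-bit masks, not addrlib's definitions. Requires C++14.
#include <cstdint>
#include <cstdio>

static constexpr uint64_t X0 = 1ull << 0, X1 = 1ull << 1, X2 = 1ull << 2,  X3 = 1ull << 3;
static constexpr uint64_t Y0 = 1ull << 8, Y1 = 1ull << 9, Y2 = 1ull << 10, Y3 = 1ull << 11;

constexpr uint32_t Log2U64(uint64_t v) { return (v <= 1) ? 0u : 1u + Log2U64(v >> 1); }

// Same packing as GetMicroSwKey(): 6 bits per low address bit, value = log2(source bit) + 1.
constexpr uint64_t MakeMicroSwKey(const uint64_t (&eq)[8])
{
    uint64_t key = 0;
    for (uint32_t i = 0; i < 8; i++)
    {
        if (eq[i] != 0)
        {
            key |= static_cast<uint64_t>(Log2U64(eq[i]) + 1) << (i * 6);
        }
    }
    return key;
}

int main()
{
    // Low 8 address bits of a hypothetical 1BPE 2D micro-block.
    constexpr uint64_t eq[8] = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 };
    printf("microSwKey = 0x%012llx\n", (unsigned long long)MakeMicroSwKey(eq));
    return 0;
}
```

In the library itself this key is computed once from the swizzle equation in GetCopyMemImgFunc and compared against the MicroEq keys of the MicroSw_* classes defined later in this header.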
+#if ADDR_HAS_AVX2
+// Ensures all non-temporal/stream stores have completed.
+AVX2_FUNC static inline void NonTemporalStoreFence()
+{
+    _mm_sfence();
+}
+
+// Ensures all non-temporal/stream loads and stores have completed.
+AVX2_FUNC static inline void NonTemporalLoadStoreFence()
+{
+    _mm_mfence();
+}
+
+AVX2_FUNC static inline void StreamCopyToImgAligned(
+    void*       pImg, // Memory to write to, must be 256B aligned.
+    const void* pBuf, // Memory to read from, can be unaligned.
+    size_t      size) // Bytes to copy, must be 256B aligned.
+{
+    __m256i*       pAlignedOut  = reinterpret_cast<__m256i*>(pImg);
+    const __m256i* pUnalignedIn = reinterpret_cast<const __m256i*>(pBuf);
+    ADDR_ASSERT(PowTwoAlign(uint64_t(size), 256ULL) == uint64_t(size));
+    while (size > 0)
+    {
+        // Use streaming stores to optimize memory behavior-- this requires aligned memory.
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+        _mm256_stream_si256(pAlignedOut++, _mm256_loadu_si256(pUnalignedIn++));
+
+        size -= 256;
+    }
+}
+
+AVX2_FUNC static inline void StreamCopyFromImgAligned(
+    void*       pBuf, // Memory to write to, can be unaligned.
+    const void* pImg, // Memory to read from, must be 256B aligned.
+    size_t      size) // Bytes to copy, must be 256B aligned.
+{
+    __m256i*       pUnalignedOut = reinterpret_cast<__m256i*>(pBuf);
+    const __m256i* pAlignedIn    = reinterpret_cast<const __m256i*>(pImg);
+    ADDR_ASSERT(PowTwoAlign(uint64_t(size), 256ULL) == uint64_t(size));
+    while (size > 0)
+    {
+        // Use streaming loads to optimize memory behavior-- this requires aligned memory.
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+        _mm256_storeu_si256(pUnalignedOut++, _mm256_stream_load_si256(pAlignedIn++));
+
+        size -= 256;
+    }
+}
+
+class MicroSw_2D_1BPE_AVX2
+{
+    MicroSw_2D_1BPE_AVX2() = delete;
+public:
+    static constexpr UINT_64       MicroEq[8]       = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1};
+    static constexpr UINT_32       BpeLog2          = 0;
+    static constexpr UINT_32       ExpandX          = 4;
+
+    AVX2_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+        // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b)
+        // Yes, that means double the load instructions
+        // Each reg becomes: [ X3 X2 X1 X0 ]
+        __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0));
+        __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1));
+        __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2));
+        __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3));
+        __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4));
+        __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5));
+        __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6));
+        __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7));
+        __m128i y8 = _mm_loadu_si128((const 
__m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // First, concat the SSE regs so they end up in pairs that will unpack well + // Each reg becomes: [ Y1 X3 X2 X1 X0 ] + __m256i m1_0 = _mm256_set_m128i(y2, y0); + __m256i m1_1 = _mm256_set_m128i(y3, y1); + __m256i m1_2 = _mm256_set_m128i(y6, y4); + __m256i m1_3 = _mm256_set_m128i(y7, y5); + __m256i m1_4 = _mm256_set_m128i(y10, y8); + __m256i m1_5 = _mm256_set_m128i(y11, y9); + __m256i m1_6 = _mm256_set_m128i(y14, y12); + __m256i m1_7 = _mm256_set_m128i(y15, y13); + + // Unpack to handle the rest of the swizzling within each reg + // Each reg becomes: [ Y1 X2 Y0 X1 X0 ] + __m256i m2_0 = _mm256_unpacklo_epi32(m1_0, m1_1); + __m256i m2_1 = _mm256_unpackhi_epi32(m1_0, m1_1); + __m256i m2_2 = _mm256_unpacklo_epi32(m1_2, m1_3); + __m256i m2_3 = _mm256_unpackhi_epi32(m1_2, m1_3); + __m256i m2_4 = _mm256_unpacklo_epi32(m1_4, m1_5); + __m256i m2_5 = _mm256_unpackhi_epi32(m1_4, m1_5); + __m256i m2_6 = _mm256_unpacklo_epi32(m1_6, m1_7); + __m256i m2_7 = _mm256_unpackhi_epi32(m1_6, m1_7); + + // Move each reg around to handle high bit swizzling + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, m2_0); + _mm256_stream_si256(pAlignedOut++, m2_2); + _mm256_stream_si256(pAlignedOut++, m2_1); + _mm256_stream_si256(pAlignedOut++, m2_3); + _mm256_stream_si256(pAlignedOut++, m2_4); + _mm256_stream_si256(pAlignedOut++, m2_6); + _mm256_stream_si256(pAlignedOut++, m2_5); + _mm256_stream_si256(pAlignedOut++, m2_7); + } +}; + + +class MicroSw_2D_2BPE_AVX2 +{ + MicroSw_2D_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Y0, X1, Y1, X2, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // Each reg becomes: [ X3 X2 X1 X0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = 
_mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X2 X1 Y0 X0 0 ] + __m256i m0_0 = _mm256_unpacklo_epi32(y0, y1); + __m256i m0_1 = _mm256_unpackhi_epi32(y0, y1); + __m256i m0_2 = _mm256_unpacklo_epi32(y2, y3); + __m256i m0_3 = _mm256_unpackhi_epi32(y2, y3); + __m256i m0_4 = _mm256_unpacklo_epi32(y4, y5); + __m256i m0_5 = _mm256_unpackhi_epi32(y4, y5); + __m256i m0_6 = _mm256_unpacklo_epi32(y6, y7); + __m256i m0_7 = _mm256_unpackhi_epi32(y6, y7); + + // Then use a cross-lane dual permute to do a 128b interleave across y1 + // Each reg becomes: [ Y1 X1 Y0 X0 0 ] + __m256i m1_0 = _mm256_permute2x128_si256(m0_0, m0_2, 0x20); + __m256i m1_1 = _mm256_permute2x128_si256(m0_1, m0_3, 0x20); + __m256i m1_2 = _mm256_permute2x128_si256(m0_4, m0_6, 0x20); + __m256i m1_3 = _mm256_permute2x128_si256(m0_5, m0_7, 0x20); + __m256i m1_4 = _mm256_permute2x128_si256(m0_0, m0_2, 0x31); + __m256i m1_5 = _mm256_permute2x128_si256(m0_1, m0_3, 0x31); + __m256i m1_6 = _mm256_permute2x128_si256(m0_4, m0_6, 0x31); + __m256i m1_7 = _mm256_permute2x128_si256(m0_5, m0_7, 0x31); + + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, m1_0); + _mm256_stream_si256(pAlignedOut++, m1_1); + _mm256_stream_si256(pAlignedOut++, m1_2); + _mm256_stream_si256(pAlignedOut++, m1_3); + _mm256_stream_si256(pAlignedOut++, m1_4); + _mm256_stream_si256(pAlignedOut++, m1_5); + _mm256_stream_si256(pAlignedOut++, m1_6); + _mm256_stream_si256(pAlignedOut++, m1_7); + } +}; + + +class MicroSw_2D_4BPE_AVX2 +{ + MicroSw_2D_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = { 8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // Each reg becomes: [ X2 X1 X0 0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 3 are already right: (X1, Y0, *, *, *) + // Start: y0/y1: [ 0 1 2 3 ] [ 4 5 6 7 ] (QWORDS) + // Desired: [ 0 4 1 5 ] [ 2 6 3 7 ] + + // Do a permute to reorder each register to have all values in the correct 128b lanes. 
+ // Swap 2/3 -> mask 0b11_01_10_00 -> 0xD8 + // Result: [ 0 2 1 3 ] [ 4 6 5 7 ] + // Each reg becomes: [ X1 X2 X0 0 0 ] + __m256i perm0 = _mm256_permute4x64_epi64(y0, 0xD8); + __m256i perm1 = _mm256_permute4x64_epi64(y1, 0xD8); + __m256i perm2 = _mm256_permute4x64_epi64(y2, 0xD8); + __m256i perm3 = _mm256_permute4x64_epi64(y3, 0xD8); + __m256i perm4 = _mm256_permute4x64_epi64(y4, 0xD8); + __m256i perm5 = _mm256_permute4x64_epi64(y5, 0xD8); + __m256i perm6 = _mm256_permute4x64_epi64(y6, 0xD8); + __m256i perm7 = _mm256_permute4x64_epi64(y7, 0xD8); + + // Then use unpack intrinsics to interleave two regs (within those lanes), which leaves it in the final place + // Result: [ 0 4 1 5 ] [ 2 6 3 7] + // Each reg becomes: [ X1 Y0 X0 0 0 ] + __m256i unpack0 = _mm256_unpacklo_epi64(perm0, perm1); + __m256i unpack1 = _mm256_unpackhi_epi64(perm0, perm1); + __m256i unpack2 = _mm256_unpacklo_epi64(perm2, perm3); + __m256i unpack3 = _mm256_unpackhi_epi64(perm2, perm3); + __m256i unpack4 = _mm256_unpacklo_epi64(perm4, perm5); + __m256i unpack5 = _mm256_unpackhi_epi64(perm4, perm5); + __m256i unpack6 = _mm256_unpacklo_epi64(perm6, perm7); + __m256i unpack7 = _mm256_unpackhi_epi64(perm6, perm7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; + +class MicroSw_2D_8BPE_AVX2 +{ + MicroSw_2D_8BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, X1, X2, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 1}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // Each reg becomes: [ X1 X0 0 0 0 ] + __m256i y0a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 0))); + __m256i y0b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 32))); + __m256i y1a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 0))); + __m256i y1b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 32))); + __m256i y2a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 0))); + __m256i y2b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 32))); + __m256i y3a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 0))); + __m256i y3b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 32))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0a/y1a: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ 
Y0 X0 0 0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0a, y1a, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0a, y1a, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2a, y3a, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2a, y3a, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y0b, y1b, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y0b, y1b, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y2b, y3b, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y2b, y3b, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_2D_16BPE_AVX2 +{ + MicroSw_2D_16BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Y0, X1, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 1}; + static constexpr UINT_32 BpeLog2 = 4; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (4 col * 128b = 256bx2) + __m256i y0a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 0))); + __m256i y0b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0) + 32))); + __m256i y1a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 0))); + __m256i y1b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1) + 32))); + __m256i y2a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 0))); + __m256i y2b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2) + 32))); + __m256i y3a = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 0))); + __m256i y3b = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3) + 32))); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, y0a); + _mm256_stream_si256(pAlignedOut++, y1a); + _mm256_stream_si256(pAlignedOut++, y0b); + _mm256_stream_si256(pAlignedOut++, y1b); + _mm256_stream_si256(pAlignedOut++, y2a); + _mm256_stream_si256(pAlignedOut++, y3a); + _mm256_stream_si256(pAlignedOut++, y2b); + _mm256_stream_si256(pAlignedOut++, y3b); + } +}; + + +class MicroSw_3D_1BPE_AVX2 +{ + MicroSw_3D_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Z0, Y0, Y1, Z1, X2, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 8}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (8 col * 8b = 64b) + // We'll do one 64x4 gather-load for each slice. + + // Pre-compute the y offsets + __m256i yOffsets = _mm256_set_epi64x((3 * bufStrideY), (2 * bufStrideY), (1 * bufStrideY), 0); + + // Then gather, incrementing the 'base' address for each slice. + // Each reg becomes: [ Y1 Y0 X2 X1 X0 ] + __m256i z0 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0))), yOffsets, 1); + __m256i z1 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1))), yOffsets, 1); + __m256i z2 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2))), yOffsets, 1); + __m256i z3 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3))), yOffsets, 1); + __m256i z4 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4))), yOffsets, 1); + __m256i z5 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5))), yOffsets, 1); + __m256i z6 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6))), yOffsets, 1); + __m256i z7 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7))), yOffsets, 1); + + // First swap x2 and y0 bits, so the following unpack ends up with x2 across registers + // Each reg becomes: [ Y1 X2 Y0 X1 X0 ] + __m256i shuf0 = _mm256_shuffle_epi32(z0, 0b11011000); + __m256i shuf1 = _mm256_shuffle_epi32(z1, 0b11011000); + __m256i shuf2 = _mm256_shuffle_epi32(z2, 0b11011000); + __m256i shuf3 = _mm256_shuffle_epi32(z3, 0b11011000); + __m256i shuf4 = _mm256_shuffle_epi32(z4, 0b11011000); + __m256i shuf5 = _mm256_shuffle_epi32(z5, 0b11011000); + __m256i shuf6 = _mm256_shuffle_epi32(z6, 0b11011000); + __m256i shuf7 = _mm256_shuffle_epi32(z7, 0b11011000); + + // Unpack to 32-bit interleave by z0. + // Each reg becomes: [ Y1 Y0 Z0 X1 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi32(shuf0, shuf1); + __m256i unpack1 = _mm256_unpackhi_epi32(shuf0, shuf1); + __m256i unpack2 = _mm256_unpacklo_epi32(shuf2, shuf3); + __m256i unpack3 = _mm256_unpackhi_epi32(shuf2, shuf3); + __m256i unpack4 = _mm256_unpacklo_epi32(shuf4, shuf5); + __m256i unpack5 = _mm256_unpackhi_epi32(shuf4, shuf5); + __m256i unpack6 = _mm256_unpacklo_epi32(shuf6, shuf7); + __m256i unpack7 = _mm256_unpackhi_epi32(shuf6, shuf7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; + +class MicroSw_3D_2BPE_AVX2 +{ + MicroSw_3D_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Z0, Y0, X1, Z1, Y1, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 8}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // We'll do one 64x4 gather-load for each slice. + + // Pre-compute the y offsets, doing a pre-swizzle between y0 and y1 by changing the order of offsets. + // The pre-swizzle is done so that y1 gets separated across different registers in the unpack below. + __m256i yOffsets = _mm256_set_epi64x((3 * bufStrideY), (1 * bufStrideY), (2 * bufStrideY), 0); + + // Then gather, incrementing the 'base' address for each slice. + // Each reg becomes: [ Y0 Y1 X1 X0 0 ] + __m256i z0 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0))), yOffsets, 1); + __m256i z1 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1))), yOffsets, 1); + __m256i z2 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2))), yOffsets, 1); + __m256i z3 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3))), yOffsets, 1); + __m256i z4 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4))), yOffsets, 1); + __m256i z5 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5))), yOffsets, 1); + __m256i z6 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6))), yOffsets, 1); + __m256i z7 = _mm256_i64gather_epi64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7))), yOffsets, 1); + + // Unpack to 32-bit interleave by z0. 
+ // Each reg becomes: [ Y0 X1 Z0 X0 0 ] + __m256i unpack0 = _mm256_unpacklo_epi32(z0, z1); + __m256i unpack1 = _mm256_unpackhi_epi32(z0, z1); + __m256i unpack2 = _mm256_unpacklo_epi32(z2, z3); + __m256i unpack3 = _mm256_unpackhi_epi32(z2, z3); + __m256i unpack4 = _mm256_unpacklo_epi32(z4, z5); + __m256i unpack5 = _mm256_unpackhi_epi32(z4, z5); + __m256i unpack6 = _mm256_unpacklo_epi32(z6, z7); + __m256i unpack7 = _mm256_unpackhi_epi32(z6, z7); + + // Then do a cross-lane permute to swap y0 and x1 + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X1 Y0 Z0 X0 0 ] + __m256i permute0 = _mm256_permute4x64_epi64(unpack0, 0b11011000); + __m256i permute1 = _mm256_permute4x64_epi64(unpack1, 0b11011000); + __m256i permute2 = _mm256_permute4x64_epi64(unpack2, 0b11011000); + __m256i permute3 = _mm256_permute4x64_epi64(unpack3, 0b11011000); + __m256i permute4 = _mm256_permute4x64_epi64(unpack4, 0b11011000); + __m256i permute5 = _mm256_permute4x64_epi64(unpack5, 0b11011000); + __m256i permute6 = _mm256_permute4x64_epi64(unpack6, 0b11011000); + __m256i permute7 = _mm256_permute4x64_epi64(unpack7, 0b11011000); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, permute0); + _mm256_stream_si256(pAlignedOut++, permute2); + _mm256_stream_si256(pAlignedOut++, permute1); + _mm256_stream_si256(pAlignedOut++, permute3); + _mm256_stream_si256(pAlignedOut++, permute4); + _mm256_stream_si256(pAlignedOut++, permute6); + _mm256_stream_si256(pAlignedOut++, permute5); + _mm256_stream_si256(pAlignedOut++, permute7); + } +}; + + +class MicroSw_3D_4BPE_AVX2 +{ + MicroSw_3D_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Z0, Y1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 4}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 4 rows * (4 col * 32b = 128b) + // Each reg becomes: [ X1 X0 0 0 ] + __m128i z0y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0))); + __m128i z0y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1))); + __m128i z0y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))); + __m128i z0y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))); + __m128i z1y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0))); + __m128i z1y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1))); + __m128i z1y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))); + __m128i z1y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))); + __m128i z2y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0))); + __m128i z2y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1))); + __m128i z2y2 = _mm_loadu_si128((const 
__m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))); + __m128i z2y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))); + __m128i z3y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0))); + __m128i z3y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1))); + __m128i z3y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))); + __m128i z3y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X1 X0 0 0 ] + __m256i concat0 = _mm256_set_m128i(z0y1, z0y0); + __m256i concat1 = _mm256_set_m128i(z0y3, z0y2); + __m256i concat2 = _mm256_set_m128i(z1y1, z1y0); + __m256i concat3 = _mm256_set_m128i(z1y3, z1y2); + __m256i concat4 = _mm256_set_m128i(z2y1, z2y0); + __m256i concat5 = _mm256_set_m128i(z2y3, z2y2); + __m256i concat6 = _mm256_set_m128i(z3y1, z3y0); + __m256i concat7 = _mm256_set_m128i(z3y3, z3y2); + + // Then do a cross-lane permute to swap y0 and x1 + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X1 Y0 X0 0 0 ] + __m256i perm0 = _mm256_permute4x64_epi64(concat0, 0b11011000); + __m256i perm1 = _mm256_permute4x64_epi64(concat1, 0b11011000); + __m256i perm2 = _mm256_permute4x64_epi64(concat2, 0b11011000); + __m256i perm3 = _mm256_permute4x64_epi64(concat3, 0b11011000); + __m256i perm4 = _mm256_permute4x64_epi64(concat4, 0b11011000); + __m256i perm5 = _mm256_permute4x64_epi64(concat5, 0b11011000); + __m256i perm6 = _mm256_permute4x64_epi64(concat6, 0b11011000); + __m256i perm7 = _mm256_permute4x64_epi64(concat7, 0b11011000); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_3D_8BPE_AVX2 +{ + MicroSw_3D_8BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, Z0, X1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 2, 4}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b) + // Each reg becomes: [ X1 X0 0 0 0 ] + __m256i z0y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + __m256i z0y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + __m256i z1y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + __m256i z1y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + __m256i z2y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + __m256i z2y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + __m256i z3y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + __m256i z3y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // Use 128-bit permutes to swap the y0 bit (diff registers) with the x0 bit (128b boundary) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X0 0 0 0 ] + __m256i z0x0 = _mm256_permute2x128_si256(z0y0, z0y1, 0x20); + __m256i z0x1 = _mm256_permute2x128_si256(z0y0, z0y1, 0x31); + __m256i z1x0 = _mm256_permute2x128_si256(z1y0, z1y1, 0x20); + __m256i z1x1 = _mm256_permute2x128_si256(z1y0, z1y1, 0x31); + __m256i z2x0 = _mm256_permute2x128_si256(z2y0, z2y1, 0x20); + __m256i z2x1 = _mm256_permute2x128_si256(z2y0, z2y1, 0x31); + __m256i z3x0 = _mm256_permute2x128_si256(z3y0, z3y1, 0x20); + __m256i z3x1 = _mm256_permute2x128_si256(z3y0, z3y1, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, z0x0); + _mm256_stream_si256(pAlignedOut++, z1x0); + _mm256_stream_si256(pAlignedOut++, z0x1); + _mm256_stream_si256(pAlignedOut++, z1x1); + _mm256_stream_si256(pAlignedOut++, z2x0); + _mm256_stream_si256(pAlignedOut++, z3x0); + _mm256_stream_si256(pAlignedOut++, z2x1); + _mm256_stream_si256(pAlignedOut++, z3x1); + } +}; + + +class MicroSw_3D_16BPE_AVX2 +{ + MicroSw_3D_16BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Z0, Y0, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {2, 2, 4}; + static constexpr UINT_32 BpeLog2 = 4; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (2 col * 128b = 256b) + __m256i z0y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + __m256i z0y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + __m256i z1y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + __m256i z1y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + __m256i z2y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + __m256i z2y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + __m256i z3y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + __m256i z3y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // The whole swizzle can be handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, z0y0); + _mm256_stream_si256(pAlignedOut++, z1y0); + _mm256_stream_si256(pAlignedOut++, z0y1); + _mm256_stream_si256(pAlignedOut++, z1y1); + _mm256_stream_si256(pAlignedOut++, z2y0); + _mm256_stream_si256(pAlignedOut++, z3y0); + _mm256_stream_si256(pAlignedOut++, z2y1); + _mm256_stream_si256(pAlignedOut++, z3y1); + } +}; + + +class MicroSw_R_1BPE_AVX2 +{ + MicroSw_R_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, X3, Y0, Y1, Y2, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Yes, that means double the load instructions + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y1, y0); + __m256i concat1 = _mm256_set_m128i(y3, y2); + __m256i concat2 = _mm256_set_m128i(y5, y4); + __m256i concat3 = _mm256_set_m128i(y7, y6); + __m256i concat4 = _mm256_set_m128i(y9, y8); + __m256i concat5 = _mm256_set_m128i(y11, y10); + __m256i concat6 = _mm256_set_m128i(y13, y12); + __m256i concat7 = _mm256_set_m128i(y15, y14); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, concat0); + _mm256_stream_si256(pAlignedOut++, concat1); + _mm256_stream_si256(pAlignedOut++, concat2); + _mm256_stream_si256(pAlignedOut++, concat3); + _mm256_stream_si256(pAlignedOut++, concat4); + _mm256_stream_si256(pAlignedOut++, concat5); + _mm256_stream_si256(pAlignedOut++, concat6); + _mm256_stream_si256(pAlignedOut++, concat7); + } +}; + +class MicroSw_R_2BPE_AVX2 +{ + MicroSw_R_2BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, X1, X2, Y0, Y1, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // Each reg becomes: [ X3 X2 X1 X0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X2 X1 X0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0, y1, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0, y1, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2, y3, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2, y3, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y4, y5, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y4, y5, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y6, y7, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y6, y7, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + +class MicroSw_R_4BPE_AVX2 +{ + MicroSw_R_4BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, X1, Y0, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // Each reg becomes: [ X2 X1 X0 0 0 ] + __m256i y0 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 0))); + __m256i y1 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 1))); + __m256i y2 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 2))); + __m256i y3 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 3))); + __m256i y4 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 4))); + __m256i y5 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 5))); + __m256i y6 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 6))); + __m256i y7 = _mm256_loadu_si256(reinterpret_cast(VoidPtrInc(pBuf, bufStrideY * 7))); + + // In-register shuffling on the bottom 5 bits of which the bottom 4 are already right: (Y0, *, *, *, *) + // Start: y0/y1: [ 0 1 ] [ 2 3 ] (128b) + // Desired: [ 0 2 ] [ 1 3 ] + // The magic mask value lines up with the numbers above in hex, so 0x20 means [ 0 2 ] + // Each reg becomes: [ Y0 X1 X0 0 0 ] + __m256i perm0 = _mm256_permute2x128_si256(y0, y1, 0x20); + __m256i perm1 = _mm256_permute2x128_si256(y0, y1, 0x31); + __m256i perm2 = _mm256_permute2x128_si256(y2, y3, 0x20); + __m256i perm3 = _mm256_permute2x128_si256(y2, y3, 0x31); + __m256i perm4 = _mm256_permute2x128_si256(y4, y5, 0x20); + __m256i perm5 = _mm256_permute2x128_si256(y4, y5, 0x31); + __m256i perm6 = _mm256_permute2x128_si256(y6, y7, 0x20); + __m256i perm7 = _mm256_permute2x128_si256(y6, y7, 0x31); + + // The top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. 
+ __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, perm0); + _mm256_stream_si256(pAlignedOut++, perm2); + _mm256_stream_si256(pAlignedOut++, perm1); + _mm256_stream_si256(pAlignedOut++, perm3); + _mm256_stream_si256(pAlignedOut++, perm4); + _mm256_stream_si256(pAlignedOut++, perm6); + _mm256_stream_si256(pAlignedOut++, perm5); + _mm256_stream_si256(pAlignedOut++, perm7); + } +}; + + +class MicroSw_Z_1BPE_AVX2 +{ + MicroSw_Z_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, Y0, X1, Y1, X2, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*10)); + __m128i y11 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY*15)); + + // First combine two 128-bit values (across y1) + // Each reg becomes: [ Y1 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y2, y0); + __m256i concat1 = _mm256_set_m128i(y3, y1); + __m256i concat2 = _mm256_set_m128i(y6, y4); + __m256i concat3 = _mm256_set_m128i(y7, y5); + __m256i concat4 = _mm256_set_m128i(y10, y8); + __m256i concat5 = _mm256_set_m128i(y11, y9); + __m256i concat6 = _mm256_set_m128i(y14, y12); + __m256i concat7 = _mm256_set_m128i(y15, y13); + + // Then do a 16-bit interleave across y0. This is done in parallel on each 128b lane. 
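+        // A 16b unit here is two horizontally adjacent 8b texels (X0), so interleaving the units of
+        // rows y and y+1 places Y0 at bit 1 of the byte offset, as this MicroEq requires.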
+ // Each reg becomes: [ Y1 X2 X1 Y0 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi16(concat0, concat1); + __m256i unpack1 = _mm256_unpackhi_epi16(concat0, concat1); + __m256i unpack2 = _mm256_unpacklo_epi16(concat2, concat3); + __m256i unpack3 = _mm256_unpackhi_epi16(concat2, concat3); + __m256i unpack4 = _mm256_unpacklo_epi16(concat4, concat5); + __m256i unpack5 = _mm256_unpackhi_epi16(concat4, concat5); + __m256i unpack6 = _mm256_unpacklo_epi16(concat6, concat7); + __m256i unpack7 = _mm256_unpackhi_epi16(concat6, concat7); + + // Then do a cross-lane permute to change our 128b interleave across y1 to a 64-bit interleave. + // Change [ 0 1 2 3 ] -> [ 0 2 1 3] + // Each reg becomes: [ X2 Y1 X1 Y0 X0 ] + __m256i permute0 = _mm256_permute4x64_epi64(unpack0, 0b11011000); + __m256i permute1 = _mm256_permute4x64_epi64(unpack1, 0b11011000); + __m256i permute2 = _mm256_permute4x64_epi64(unpack2, 0b11011000); + __m256i permute3 = _mm256_permute4x64_epi64(unpack3, 0b11011000); + __m256i permute4 = _mm256_permute4x64_epi64(unpack4, 0b11011000); + __m256i permute5 = _mm256_permute4x64_epi64(unpack5, 0b11011000); + __m256i permute6 = _mm256_permute4x64_epi64(unpack6, 0b11011000); + __m256i permute7 = _mm256_permute4x64_epi64(unpack7, 0b11011000); + + // Finally, the top 3 bits of the swizzle are handled by the order of the registers here. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, permute0); + _mm256_stream_si256(pAlignedOut++, permute2); + _mm256_stream_si256(pAlignedOut++, permute1); + _mm256_stream_si256(pAlignedOut++, permute3); + _mm256_stream_si256(pAlignedOut++, permute4); + _mm256_stream_si256(pAlignedOut++, permute6); + _mm256_stream_si256(pAlignedOut++, permute5); + _mm256_stream_si256(pAlignedOut++, permute7); + } +}; + + +class MicroSw_D_1BPE_AVX2 +{ + MicroSw_D_1BPE_AVX2() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, Y1, Y0, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + AVX2_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Yes, that means double the load instructions + // Each reg becomes: [ X3 X2 X1 X0 ] + __m128i y0 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 0)); + __m128i y1 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 1)); + __m128i y2 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 2)); + __m128i y3 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 3)); + __m128i y4 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 4)); + __m128i y5 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 5)); + __m128i y6 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 6)); + __m128i y7 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 7)); + __m128i y8 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 8)); + __m128i y9 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 9)); + __m128i y10 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 10)); + __m128i y11 = 
_mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 11)); + __m128i y12 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 12)); + __m128i y13 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 13)); + __m128i y14 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 14)); + __m128i y15 = _mm_loadu_si128((const __m128i*)VoidPtrInc(pBuf, bufStrideY * 15)); + + // Concat two 128b together to form one 256b register, along y0. + // Each reg becomes: [ Y0 X3 X2 X1 X0 ] + __m256i concat0 = _mm256_set_m128i(y1, y0); + __m256i concat1 = _mm256_set_m128i(y3, y2); + __m256i concat2 = _mm256_set_m128i(y5, y4); + __m256i concat3 = _mm256_set_m128i(y7, y6); + __m256i concat4 = _mm256_set_m128i(y9, y8); + __m256i concat5 = _mm256_set_m128i(y11, y10); + __m256i concat6 = _mm256_set_m128i(y13, y12); + __m256i concat7 = _mm256_set_m128i(y15, y14); + + // Then do a 64-bit interleave along y1. + // Each reg becomes: [ Y0 Y1 X2 X1 X0 ] + __m256i unpack0 = _mm256_unpacklo_epi64(concat0, concat1); + __m256i unpack1 = _mm256_unpackhi_epi64(concat0, concat1); + __m256i unpack2 = _mm256_unpacklo_epi64(concat2, concat3); + __m256i unpack3 = _mm256_unpackhi_epi64(concat2, concat3); + __m256i unpack4 = _mm256_unpacklo_epi64(concat4, concat5); + __m256i unpack5 = _mm256_unpackhi_epi64(concat4, concat5); + __m256i unpack6 = _mm256_unpacklo_epi64(concat6, concat7); + __m256i unpack7 = _mm256_unpackhi_epi64(concat6, concat7); + + // The top 3 bits of the swizzle are handled by the order of the registers here. The rest are already right. + // Use streaming stores to optimize memory behavior-- this requires aligned memory. + __m256i* pAlignedOut = reinterpret_cast<__m256i*>(pImgMicroblock); + _mm256_stream_si256(pAlignedOut++, unpack0); + _mm256_stream_si256(pAlignedOut++, unpack2); + _mm256_stream_si256(pAlignedOut++, unpack1); + _mm256_stream_si256(pAlignedOut++, unpack3); + _mm256_stream_si256(pAlignedOut++, unpack4); + _mm256_stream_si256(pAlignedOut++, unpack6); + _mm256_stream_si256(pAlignedOut++, unpack5); + _mm256_stream_si256(pAlignedOut++, unpack7); + } +}; +#endif // ADDR_HAS_AVX2 + +#if ADDR_HAS_NEON +class MicroSw_2D_1BPE_NEON +{ + MicroSw_2D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Y0, X2, Y1, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint32x4_t y0 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4_t y1 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4_t y2 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4_t y3 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4_t y4 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4_t y5 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4_t y6 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4_t y7 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint32x4_t y8 = 
vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint32x4_t y9 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint32x4_t y10 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint32x4_t y11 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint32x4_t y12 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint32x4_t y13 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint32x4_t y14 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint32x4_t y15 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X2 Y0 X1 X0 ] + uint32x4_t comb0a = vzip1q_u32(y0, y1); + uint32x4_t comb0b = vzip2q_u32(y0, y1); + uint32x4_t comb1a = vzip1q_u32(y2, y3); + uint32x4_t comb1b = vzip2q_u32(y2, y3); + uint32x4_t comb2a = vzip1q_u32(y4, y5); + uint32x4_t comb2b = vzip2q_u32(y4, y5); + uint32x4_t comb3a = vzip1q_u32(y6, y7); + uint32x4_t comb3b = vzip2q_u32(y6, y7); + uint32x4_t comb4a = vzip1q_u32(y8, y9); + uint32x4_t comb4b = vzip2q_u32(y8, y9); + uint32x4_t comb5a = vzip1q_u32(y10, y11); + uint32x4_t comb5b = vzip2q_u32(y10, y11); + uint32x4_t comb6a = vzip1q_u32(y12, y13); + uint32x4_t comb6b = vzip2q_u32(y12, y13); + uint32x4_t comb7a = vzip1q_u32(y14, y15); + uint32x4_t comb7b = vzip2q_u32(y14, y15); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint32x4x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint32x4x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint32x4x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + // And store them using the largest contiguous store we can. + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_2D_2BPE_NEON +{ + MicroSw_2D_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Y0, X1, Y1, X2, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
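+        // vld1q_u32_x2 pulls in one full 32B row as two 128b halves: val[0] holds texels 0..7 and
+        // val[1] holds texels 8..15 of the row.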
+ // Each reg becomes: [ X2 X1 X0 0 ] + uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // Do a 32-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ X1 Y0 X0 0 ] + uint32x4_t comb0a = vzip1q_u32(y0.val[0], y1.val[0]); + uint32x4_t comb0b = vzip2q_u32(y0.val[0], y1.val[0]); + uint32x4_t comb1a = vzip1q_u32(y0.val[1], y1.val[1]); + uint32x4_t comb1b = vzip2q_u32(y0.val[1], y1.val[1]); + uint32x4_t comb2a = vzip1q_u32(y2.val[0], y3.val[0]); + uint32x4_t comb2b = vzip2q_u32(y2.val[0], y3.val[0]); + uint32x4_t comb3a = vzip1q_u32(y2.val[1], y3.val[1]); + uint32x4_t comb3b = vzip2q_u32(y2.val[1], y3.val[1]); + uint32x4_t comb4a = vzip1q_u32(y4.val[0], y5.val[0]); + uint32x4_t comb4b = vzip2q_u32(y4.val[0], y5.val[0]); + uint32x4_t comb5a = vzip1q_u32(y4.val[1], y5.val[1]); + uint32x4_t comb5b = vzip2q_u32(y4.val[1], y5.val[1]); + uint32x4_t comb6a = vzip1q_u32(y6.val[0], y7.val[0]); + uint32x4_t comb6b = vzip2q_u32(y6.val[0], y7.val[0]); + uint32x4_t comb7a = vzip1q_u32(y6.val[1], y7.val[1]); + uint32x4_t comb7b = vzip2q_u32(y6.val[1], y7.val[1]); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint32x4x4_t out0 = { { comb0a, comb2a, comb0b, comb2b } }; + uint32x4x4_t out1 = { { comb4a, comb6a, comb4b, comb6b} }; + uint32x4x4_t out2 = { { comb1a, comb3a, comb1b, comb3b } }; + uint32x4x4_t out3 = { { comb5a, comb7a, comb5b, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +class MicroSw_2D_4BPE_NEON +{ + MicroSw_2D_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = { 8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (8 col * 32b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
+ // Each reg becomes: [ X1 X0 0 0 ] + uint64x2x2_t y0 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint64x2x2_t y1 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint64x2x2_t y2 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint64x2x2_t y3 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint64x2x2_t y4 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint64x2x2_t y5 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint64x2x2_t y6 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint64x2x2_t y7 = vld1q_u64_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // Do a 64-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ Y0 X0 0 0 ] + uint64x2_t comb0a = vzip1q_u64(y0.val[0], y1.val[0]); + uint64x2_t comb0b = vzip2q_u64(y0.val[0], y1.val[0]); + uint64x2_t comb1a = vzip1q_u64(y0.val[1], y1.val[1]); + uint64x2_t comb1b = vzip2q_u64(y0.val[1], y1.val[1]); + uint64x2_t comb2a = vzip1q_u64(y2.val[0], y3.val[0]); + uint64x2_t comb2b = vzip2q_u64(y2.val[0], y3.val[0]); + uint64x2_t comb3a = vzip1q_u64(y2.val[1], y3.val[1]); + uint64x2_t comb3b = vzip2q_u64(y2.val[1], y3.val[1]); + uint64x2_t comb4a = vzip1q_u64(y4.val[0], y5.val[0]); + uint64x2_t comb4b = vzip2q_u64(y4.val[0], y5.val[0]); + uint64x2_t comb5a = vzip1q_u64(y4.val[1], y5.val[1]); + uint64x2_t comb5b = vzip2q_u64(y4.val[1], y5.val[1]); + uint64x2_t comb6a = vzip1q_u64(y6.val[0], y7.val[0]); + uint64x2_t comb6b = vzip2q_u64(y6.val[0], y7.val[0]); + uint64x2_t comb7a = vzip1q_u64(y6.val[1], y7.val[1]); + uint64x2_t comb7b = vzip2q_u64(y6.val[1], y7.val[1]); + + // The top 4 bits of the swizzle are handled by plain reg moves here. + uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } + +}; + +class MicroSw_2D_8BPE_NEON +{ + MicroSw_2D_8BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, X1, X2, Y1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 1}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // ARM can do a 512b load/store, but the actual values are 128b. + uint32x4x4_t y0 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x4_t y1 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x4_t y2 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x4_t y3 = vld1q_u32_x4(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
The bottom 4 bits need no handling.
+        uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y0.val[1], y1.val[1] } };
+        uint32x4x4_t out1 = { { y0.val[2], y1.val[2], y0.val[3], y1.val[3] } };
+        uint32x4x4_t out2 = { { y2.val[0], y3.val[0], y2.val[1], y3.val[1] } };
+        uint32x4x4_t out3 = { { y2.val[2], y3.val[2], y2.val[3], y3.val[3] } };
+
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 0 * 64)), out0);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 1 * 64)), out1);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 2 * 64)), out2);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 3 * 64)), out3);
+    }
+};
+class MicroSw_2D_16BPE_NEON
+{
+    MicroSw_2D_16BPE_NEON() = delete;
+public:
+    static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Y0, X1, Y1 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 1};
+    static constexpr UINT_32 BpeLog2 = 4;
+    static constexpr UINT_32 ExpandX = 2;
+
+    NEON_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+
+        // Unaligned buffer loads for 4 rows * (4 col * 128b = 512b)
+        // ARM can do a 512b load/store, but the actual values are 128b.
+        uint32x4x4_t y0 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 0))));
+        uint32x4x4_t y1 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 1))));
+        uint32x4x4_t y2 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 2))));
+        uint32x4x4_t y3 = vld1q_u32_x4(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideY * 3))));
+
+        // The top 3 bits of the swizzle are handled by plain reg moves here. The rest are already right.
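+        // At 16B per texel each 128b register is exactly one texel, so the swizzle reduces to register
+        // reordering: out0 packs the 2x2 quad at (x0..1, y0..1) and the four 64B stores then step
+        // through X1 and Y1, i.e. offset bits 4..7 = (X0, Y0, X1, Y1).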
+ uint32x4x4_t out0 = { { y0.val[0], y0.val[1], y1.val[0], y1.val[1] } }; + uint32x4x4_t out1 = { { y0.val[2], y0.val[3], y1.val[2], y1.val[3] } }; + uint32x4x4_t out2 = { { y2.val[0], y2.val[1], y3.val[0], y3.val[1] } }; + uint32x4x4_t out3 = { { y2.val[2], y2.val[3], y3.val[2], y3.val[3] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +class MicroSw_3D_1BPE_NEON +{ + MicroSw_3D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, Z0, Y0, Y1, Z1, X2, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 4, 8}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // Do a lot of 64b (half reg) loads and join them + // Each reg becomes: [ Y1 X2 X1 X0 ] + uint32x4_t z0y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))))); + uint32x4_t z0y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))))); + uint32x4_t z1y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))))); + uint32x4_t z1y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))))); + uint32x4_t z2y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))))); + uint32x4_t z2y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))))); + uint32x4_t z3y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))))); + uint32x4_t z3y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))))); + uint32x4_t z4y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 2))))); + uint32x4_t z4y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 3))))); + uint32x4_t z5y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 0)))), + 
vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 2))))); + uint32x4_t z5y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 3))))); + uint32x4_t z6y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 2))))); + uint32x4_t z6y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 3))))); + uint32x4_t z7y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 2))))); + uint32x4_t z7y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 3))))); + + // Now, do two 32-bit interleaves, first for y0, then for z0 + // Each reg becomes: [ X2 Y0 X1 X0 ] + uint32x4_t ycomb0a = vzip1q_u32(z0y02, z0y13); + uint32x4_t ycomb0b = vzip2q_u32(z0y02, z0y13); + uint32x4_t ycomb1a = vzip1q_u32(z1y02, z1y13); + uint32x4_t ycomb1b = vzip2q_u32(z1y02, z1y13); + uint32x4_t ycomb2a = vzip1q_u32(z2y02, z2y13); + uint32x4_t ycomb2b = vzip2q_u32(z2y02, z2y13); + uint32x4_t ycomb3a = vzip1q_u32(z3y02, z3y13); + uint32x4_t ycomb3b = vzip2q_u32(z3y02, z3y13); + uint32x4_t ycomb4a = vzip1q_u32(z4y02, z4y13); + uint32x4_t ycomb4b = vzip2q_u32(z4y02, z4y13); + uint32x4_t ycomb5a = vzip1q_u32(z5y02, z5y13); + uint32x4_t ycomb5b = vzip2q_u32(z5y02, z5y13); + uint32x4_t ycomb6a = vzip1q_u32(z6y02, z6y13); + uint32x4_t ycomb6b = vzip2q_u32(z6y02, z6y13); + uint32x4_t ycomb7a = vzip1q_u32(z7y02, z7y13); + uint32x4_t ycomb7b = vzip2q_u32(z7y02, z7y13); + + // Each reg becomes: [ Y0 Z0 X1 X0 ] + uint32x4_t comb0a = vzip1q_u32(ycomb0a, ycomb1a); + uint32x4_t comb0b = vzip2q_u32(ycomb0a, ycomb1a); + uint32x4_t comb1a = vzip1q_u32(ycomb0b, ycomb1b); + uint32x4_t comb1b = vzip2q_u32(ycomb0b, ycomb1b); + uint32x4_t comb2a = vzip1q_u32(ycomb2a, ycomb3a); + uint32x4_t comb2b = vzip2q_u32(ycomb2a, ycomb3a); + uint32x4_t comb3a = vzip1q_u32(ycomb2b, ycomb3b); + uint32x4_t comb3b = vzip2q_u32(ycomb2b, ycomb3b); + uint32x4_t comb4a = vzip1q_u32(ycomb4a, ycomb5a); + uint32x4_t comb4b = vzip2q_u32(ycomb4a, ycomb5a); + uint32x4_t comb5a = vzip1q_u32(ycomb4b, ycomb5b); + uint32x4_t comb5b = vzip2q_u32(ycomb4b, ycomb5b); + uint32x4_t comb6a = vzip1q_u32(ycomb6a, ycomb7a); + uint32x4_t comb6b = vzip2q_u32(ycomb6a, ycomb7a); + uint32x4_t comb7a = vzip1q_u32(ycomb6b, ycomb7b); + uint32x4_t comb7b = vzip2q_u32(ycomb6b, ycomb7b); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
+ uint32x4x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint32x4x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint32x4x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_2BPE_NEON +{ + MicroSw_3D_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, Z0, Y0, X1, Z1, Y1, Z2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 8}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + + // Unaligned buffer loads for 8 slices * 4 rows * (4 col * 16b = 64b) + // Do a lot of 64b (half reg) loads and join them + // Each reg becomes: [ Y1 X1 X0 0 ] + uint32x4_t z0y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2))))); + uint32x4_t z0y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3))))); + uint32x4_t z1y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2))))); + uint32x4_t z1y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3))))); + uint32x4_t z2y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2))))); + uint32x4_t z2y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3))))); + uint32x4_t z3y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2))))); + uint32x4_t z3y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3))))); + uint32x4_t z4y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 2))))); + uint32x4_t z4y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 4) + (bufStrideY * 3))))); + uint32x4_t z5y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + 
(bufStrideY * 2))))); + uint32x4_t z5y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 5) + (bufStrideY * 3))))); + uint32x4_t z6y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 2))))); + uint32x4_t z6y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 6) + (bufStrideY * 3))))); + uint32x4_t z7y02 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 0)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 2))))); + uint32x4_t z7y13 = vcombine_u32( + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 1)))), + vld1_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 7) + (bufStrideY * 3))))); + + // Now, do two 32-bit interleaves, first for y0, then for z0 + // Each reg becomes: [ X1 Y0 X0 0 ] + uint32x4_t ycomb0a = vzip1q_u32(z0y02, z0y13); + uint32x4_t ycomb0b = vzip2q_u32(z0y02, z0y13); + uint32x4_t ycomb1a = vzip1q_u32(z1y02, z1y13); + uint32x4_t ycomb1b = vzip2q_u32(z1y02, z1y13); + uint32x4_t ycomb2a = vzip1q_u32(z2y02, z2y13); + uint32x4_t ycomb2b = vzip2q_u32(z2y02, z2y13); + uint32x4_t ycomb3a = vzip1q_u32(z3y02, z3y13); + uint32x4_t ycomb3b = vzip2q_u32(z3y02, z3y13); + uint32x4_t ycomb4a = vzip1q_u32(z4y02, z4y13); + uint32x4_t ycomb4b = vzip2q_u32(z4y02, z4y13); + uint32x4_t ycomb5a = vzip1q_u32(z5y02, z5y13); + uint32x4_t ycomb5b = vzip2q_u32(z5y02, z5y13); + uint32x4_t ycomb6a = vzip1q_u32(z6y02, z6y13); + uint32x4_t ycomb6b = vzip2q_u32(z6y02, z6y13); + uint32x4_t ycomb7a = vzip1q_u32(z7y02, z7y13); + uint32x4_t ycomb7b = vzip2q_u32(z7y02, z7y13); + + // Each reg becomes: [ Y0 Z0 X0 0 ] + uint32x4_t comb0a = vzip1q_u32(ycomb0a, ycomb1a); + uint32x4_t comb0b = vzip2q_u32(ycomb0a, ycomb1a); + uint32x4_t comb1a = vzip1q_u32(ycomb0b, ycomb1b); + uint32x4_t comb1b = vzip2q_u32(ycomb0b, ycomb1b); + uint32x4_t comb2a = vzip1q_u32(ycomb2a, ycomb3a); + uint32x4_t comb2b = vzip2q_u32(ycomb2a, ycomb3a); + uint32x4_t comb3a = vzip1q_u32(ycomb2b, ycomb3b); + uint32x4_t comb3b = vzip2q_u32(ycomb2b, ycomb3b); + uint32x4_t comb4a = vzip1q_u32(ycomb4a, ycomb5a); + uint32x4_t comb4b = vzip2q_u32(ycomb4a, ycomb5a); + uint32x4_t comb5a = vzip1q_u32(ycomb4b, ycomb5b); + uint32x4_t comb5b = vzip2q_u32(ycomb4b, ycomb5b); + uint32x4_t comb6a = vzip1q_u32(ycomb6a, ycomb7a); + uint32x4_t comb6b = vzip2q_u32(ycomb6a, ycomb7a); + uint32x4_t comb7a = vzip1q_u32(ycomb6b, ycomb7b); + uint32x4_t comb7b = vzip2q_u32(ycomb6b, ycomb7b); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. 
+ uint32x4x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint32x4x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint32x4x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint32x4x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_4BPE_NEON +{ + MicroSw_3D_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, Y0, X1, Z0, Y1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 4, 4}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 rows * (8 col * 64b = 256bx2) + // Each reg becomes: [ X1 X0 0 0 ] + uint64x2_t z0y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 0)))); + uint64x2_t z0y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 1)))); + uint64x2_t z0y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 2)))); + uint64x2_t z0y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + (bufStrideY * 3)))); + uint64x2_t z1y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 0)))); + uint64x2_t z1y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 1)))); + uint64x2_t z1y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 2)))); + uint64x2_t z1y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + (bufStrideY * 3)))); + uint64x2_t z2y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 0)))); + uint64x2_t z2y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 1)))); + uint64x2_t z2y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 2)))); + uint64x2_t z2y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + (bufStrideY * 3)))); + uint64x2_t z3y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 0)))); + uint64x2_t z3y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 1)))); + uint64x2_t z3y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 2)))); + uint64x2_t z3y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + (bufStrideY * 3)))); + + // Then do a 64-bit interleave across y0 + // Each reg becomes: [ Y0 X0 0 0 ] + uint64x2_t comb0a = vzip1q_u64(z0y0, z0y1); + uint64x2_t comb0b = vzip2q_u64(z0y0, z0y1); + uint64x2_t comb1a = vzip1q_u64(z0y2, z0y3); + uint64x2_t comb1b = vzip2q_u64(z0y2, z0y3); + uint64x2_t comb2a = vzip1q_u64(z1y0, z1y1); + uint64x2_t comb2b = vzip2q_u64(z1y0, z1y1); + uint64x2_t comb3a = vzip1q_u64(z1y2, z1y3); + uint64x2_t comb3b = vzip2q_u64(z1y2, z1y3); + uint64x2_t comb4a = vzip1q_u64(z2y0, z2y1); + uint64x2_t comb4b = vzip2q_u64(z2y0, z2y1); + uint64x2_t comb5a = vzip1q_u64(z2y2, z2y3); + uint64x2_t comb5b = vzip2q_u64(z2y2, z2y3); + uint64x2_t 
comb6a = vzip1q_u64(z3y0, z3y1); + uint64x2_t comb6b = vzip2q_u64(z3y0, z3y1); + uint64x2_t comb7a = vzip1q_u64(z3y2, z3y3); + uint64x2_t comb7b = vzip2q_u64(z3y2, z3y3); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. + uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_3D_8BPE_NEON +{ + MicroSw_3D_8BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, X0, Y0, Z0, X1, Z1 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {4, 2, 4}; + static constexpr UINT_32 BpeLog2 = 3; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. + // Each reg becomes: [ X0 0 0 0 ] + uint32x4x2_t z0y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0)))); + uint32x4x2_t z0y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY))); + uint32x4x2_t z1y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1)))); + uint32x4x2_t z1y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY))); + uint32x4x2_t z2y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2)))); + uint32x4x2_t z2y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY))); + uint32x4x2_t z3y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3)))); + uint32x4x2_t z3y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling. 
+        uint32x4x4_t out0 = { { z0y0.val[0], z0y1.val[0], z1y0.val[0], z1y1.val[0] } };
+        uint32x4x4_t out1 = { { z0y0.val[1], z0y1.val[1], z1y0.val[1], z1y1.val[1] } };
+        uint32x4x4_t out2 = { { z2y0.val[0], z2y1.val[0], z3y0.val[0], z3y1.val[0] } };
+        uint32x4x4_t out3 = { { z2y0.val[1], z2y1.val[1], z3y0.val[1], z3y1.val[1] } };
+
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 0 * 64)), out0);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 1 * 64)), out1);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 2 * 64)), out2);
+        vst1q_u32_x4(reinterpret_cast<uint32_t*>(VoidPtrInc(pImgMicroblock, 3 * 64)), out3);
+    }
+};
+
+class MicroSw_3D_16BPE_NEON
+{
+    MicroSw_3D_16BPE_NEON() = delete;
+public:
+    static constexpr UINT_64 MicroEq[8] = { 0, 0, 0, 0, X0, Z0, Y0, Z1 };
+    static constexpr ADDR_EXTENT3D MicroBlockExtent = {2, 2, 4};
+    static constexpr UINT_32 BpeLog2 = 4;
+    static constexpr UINT_32 ExpandX = 2;
+
+    NEON_FUNC static void CopyMicroBlock(
+        void*       pImgMicroblock, // Microblock to write to
+        const void* pBuf,           // Pointer to data starting from the first pixel of this block
+        size_t      bufStrideY,     // Stride of each row in pBuf
+        size_t      bufStrideZ)     // Stride of each slice in pBuf
+    {
+        // Unaligned buffer loads for 4 slices * 2 rows * (4 col * 64b = 256b)
+        // ARM can do a 256b load/store, but the actual values are 128b.
+        uint32x4x2_t z0y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 0))));
+        uint32x4x2_t z0y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 0) + bufStrideY)));
+        uint32x4x2_t z1y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 1))));
+        uint32x4x2_t z1y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 1) + bufStrideY)));
+        uint32x4x2_t z2y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 2))));
+        uint32x4x2_t z2y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 2) + bufStrideY)));
+        uint32x4x2_t z3y0 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 3))));
+        uint32x4x2_t z3y1 = vld1q_u32_x2(reinterpret_cast<const uint32_t*>(VoidPtrInc(pBuf, (bufStrideZ * 3) + bufStrideY)));
+
+        // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 bits need no handling.
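+        // One texel per 128b register again: out0 packs (x0..1, y0, z0..1) and successive 64B stores
+        // advance Y0 and Z1, i.e. offset bits 4..7 = (X0, Z0, Y0, Z1) as in the MicroEq above.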
+ uint32x4x4_t out0 = { { z0y0.val[0], z0y0.val[1], z1y0.val[0], z1y0.val[1] } }; + uint32x4x4_t out1 = { { z0y1.val[0], z0y1.val[1], z1y1.val[0], z1y1.val[1] } }; + uint32x4x4_t out2 = { { z2y0.val[0], z2y0.val[1], z3y0.val[0], z3y0.val[1] } }; + uint32x4x4_t out3 = { { z2y1.val[0], z2y1.val[1], z3y1.val[0], z3y1.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_1BPE_NEON +{ + MicroSw_R_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, X3, Y0, Y1, Y2, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint32x4_t y0 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4_t y1 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4_t y2 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4_t y3 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4_t y4 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4_t y5 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4_t y6 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4_t y7 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint32x4_t y8 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint32x4_t y9 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint32x4_t y10 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint32x4_t y11 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint32x4_t y12 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint32x4_t y13 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint32x4_t y14 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint32x4_t y15 = vld1q_u32(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. 
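+        // With a 16x16 block at 1B per texel each 128b register is a whole row and bits 4..7 are just
+        // Y0..Y3, so the microblock is written out as sixteen consecutive rows (linear within the block).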
+ uint32x4x4_t out0 = { { y0, y1, y2, y3} }; + uint32x4x4_t out1 = { { y4, y5, y6, y7 } }; + uint32x4x4_t out2 = { { y8, y9, y10, y11} }; + uint32x4x4_t out3 = { { y12, y13, y14, y15} }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_2BPE_NEON +{ + MicroSw_R_2BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, X0, X1, X2, Y0, Y1, Y2, X3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 8, 1}; + static constexpr UINT_32 BpeLog2 = 1; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. + // Each reg becomes: [ X2 X1 X0 0 ] + uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. + uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y2.val[0], y3.val[0]} }; + uint32x4x4_t out1 = { { y4.val[0], y5.val[0], y6.val[0], y7.val[0] } }; + uint32x4x4_t out2 = { { y0.val[1], y1.val[1], y2.val[1], y3.val[1]} }; + uint32x4x4_t out3 = { { y4.val[1], y5.val[1], y6.val[1], y7.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_R_4BPE_NEON +{ + MicroSw_R_4BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { 0, 0, X0, X1, Y0, Y1, X2, Y2 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {8, 8, 1}; + static constexpr UINT_32 BpeLog2 = 2; + static constexpr UINT_32 ExpandX = 4; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 8 rows * (16 col * 16b = 256b) + // ARM can do a 256b load/store, but the actual values are 128b. 
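+        // Here a row is 8 texels * 4B = 32B; val[0] and val[1] of each load hold texels 0..3 and 4..7.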
+ uint32x4x2_t y0 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint32x4x2_t y1 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint32x4x2_t y2 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint32x4x2_t y3 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint32x4x2_t y4 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint32x4x2_t y5 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint32x4x2_t y6 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint32x4x2_t y7 = vld1q_u32_x2(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + + // The top 4 bits of the swizzle are handled by plain reg moves here. The bottom 4 are identity. + uint32x4x4_t out0 = { { y0.val[0], y1.val[0], y2.val[0], y3.val[0]} }; + uint32x4x4_t out1 = { { y0.val[1], y1.val[1], y2.val[1], y3.val[1]} }; + uint32x4x4_t out2 = { { y4.val[0], y5.val[0], y6.val[0], y7.val[0] } }; + uint32x4x4_t out3 = { { y4.val[1], y5.val[1], y6.val[1], y7.val[1] } }; + + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u32_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_Z_1BPE_NEON +{ + MicroSw_Z_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, Y0, X1, Y1, X2, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 2; + + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint16x8_t y0 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint16x8_t y1 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint16x8_t y2 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint16x8_t y3 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint16x8_t y4 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint16x8_t y5 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint16x8_t y6 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint16x8_t y7 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint16x8_t y8 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint16x8_t y9 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint16x8_t y10 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint16x8_t y11 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint16x8_t y12 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint16x8_t y13 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint16x8_t y14 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint16x8_t y15 = vld1q_u16(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // First do a 16-bit interleave across y0 and recast to 64-bits (recast is a nop, just C++-ness) + // Each reg 
becomes: [ X2 X1 Y0 X0 ] + uint64x2_t firstcomb0a = vreinterpretq_u64_u16(vzip1q_u16(y0, y1)); + uint64x2_t firstcomb0b = vreinterpretq_u64_u16(vzip2q_u16(y0, y1)); + uint64x2_t firstcomb1a = vreinterpretq_u64_u16(vzip1q_u16(y2, y3)); + uint64x2_t firstcomb1b = vreinterpretq_u64_u16(vzip2q_u16(y2, y3)); + uint64x2_t firstcomb2a = vreinterpretq_u64_u16(vzip1q_u16(y4, y5)); + uint64x2_t firstcomb2b = vreinterpretq_u64_u16(vzip2q_u16(y4, y5)); + uint64x2_t firstcomb3a = vreinterpretq_u64_u16(vzip1q_u16(y6, y7)); + uint64x2_t firstcomb3b = vreinterpretq_u64_u16(vzip2q_u16(y6, y7)); + uint64x2_t firstcomb4a = vreinterpretq_u64_u16(vzip1q_u16(y8, y9)); + uint64x2_t firstcomb4b = vreinterpretq_u64_u16(vzip2q_u16(y8, y9)); + uint64x2_t firstcomb5a = vreinterpretq_u64_u16(vzip1q_u16(y10, y11)); + uint64x2_t firstcomb5b = vreinterpretq_u64_u16(vzip2q_u16(y10, y11)); + uint64x2_t firstcomb6a = vreinterpretq_u64_u16(vzip1q_u16(y12, y13)); + uint64x2_t firstcomb6b = vreinterpretq_u64_u16(vzip2q_u16(y12, y13)); + uint64x2_t firstcomb7a = vreinterpretq_u64_u16(vzip1q_u16(y14, y15)); + uint64x2_t firstcomb7b = vreinterpretq_u64_u16(vzip2q_u16(y14, y15)); + + // Then do a 64-bit interleave across y1 + // Each reg becomes: [ Y1 X1 Y0 X0 ] + uint64x2_t comb0a = vzip1q_u64(firstcomb0a, firstcomb1a); + uint64x2_t comb0b = vzip2q_u64(firstcomb0a, firstcomb1a); + uint64x2_t comb1a = vzip1q_u64(firstcomb0b, firstcomb1b); + uint64x2_t comb1b = vzip2q_u64(firstcomb0b, firstcomb1b); + uint64x2_t comb2a = vzip1q_u64(firstcomb2a, firstcomb3a); + uint64x2_t comb2b = vzip2q_u64(firstcomb2a, firstcomb3a); + uint64x2_t comb3a = vzip1q_u64(firstcomb2b, firstcomb3b); + uint64x2_t comb3b = vzip2q_u64(firstcomb2b, firstcomb3b); + uint64x2_t comb4a = vzip1q_u64(firstcomb4a, firstcomb5a); + uint64x2_t comb4b = vzip2q_u64(firstcomb4a, firstcomb5a); + uint64x2_t comb5a = vzip1q_u64(firstcomb4b, firstcomb5b); + uint64x2_t comb5b = vzip2q_u64(firstcomb4b, firstcomb5b); + uint64x2_t comb6a = vzip1q_u64(firstcomb6a, firstcomb7a); + uint64x2_t comb6b = vzip2q_u64(firstcomb6a, firstcomb7a); + uint64x2_t comb7a = vzip1q_u64(firstcomb6b, firstcomb7b); + uint64x2_t comb7b = vzip2q_u64(firstcomb6b, firstcomb7b); + + // Finally, the top 4 bits of the swizzle are handled by plain reg moves here. 
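+        // Each comb register now holds one 4x4 texel tile (offset bits 0..3 = X0, Y0, X1, Y1); the
+        // grouping and store order below supply bits 4..7 = (X2, Y2, X3, Y3).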
+ uint64x2x4_t out0 = { { comb0a, comb0b, comb2a, comb2b } }; + uint64x2x4_t out1 = { { comb1a, comb1b, comb3a, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb4b, comb6a, comb6b } }; + uint64x2x4_t out3 = { { comb5a, comb5b, comb7a, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; + +class MicroSw_D_1BPE_NEON +{ + MicroSw_D_1BPE_NEON() = delete; +public: + static constexpr UINT_64 MicroEq[8] = { X0, X1, X2, Y1, Y0, Y2, X3, Y3 }; + static constexpr ADDR_EXTENT3D MicroBlockExtent = {16, 16, 1}; + static constexpr UINT_32 BpeLog2 = 0; + static constexpr UINT_32 ExpandX = 4; + NEON_FUNC static void CopyMicroBlock( + void* pImgMicroblock, // Microblock to write to + const void* pBuf, // Pointer to data starting from the first pixel of this block + size_t bufStrideY, // Stride of each row in pBuf + size_t bufStrideZ) // Stride of each slice in pBuf + { + // Unaligned buffer loads for 16 rows * (16 col * 8b = 128b) + // Each reg becomes: [ X3 X2 X1 X0 ] + uint64x2_t y0 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 0)))); + uint64x2_t y1 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 1)))); + uint64x2_t y2 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 2)))); + uint64x2_t y3 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 3)))); + uint64x2_t y4 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 4)))); + uint64x2_t y5 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 5)))); + uint64x2_t y6 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 6)))); + uint64x2_t y7 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 7)))); + uint64x2_t y8 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 8)))); + uint64x2_t y9 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 9)))); + uint64x2_t y10 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 10)))); + uint64x2_t y11 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 11)))); + uint64x2_t y12 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 12)))); + uint64x2_t y13 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 13)))); + uint64x2_t y14 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 14)))); + uint64x2_t y15 = vld1q_u64(reinterpret_cast(VoidPtrInc(pBuf, (bufStrideY * 15)))); + + // Do a 64-bit zip/unpack operation to interleave within each 128b reg. + // Each reg becomes: [ Y1 X2 X1 X0 ] + uint64x2_t comb0a = vzip1q_u64(y0, y2); + uint64x2_t comb0b = vzip2q_u64(y0, y2); + uint64x2_t comb1a = vzip1q_u64(y1, y3); + uint64x2_t comb1b = vzip2q_u64(y1, y3); + uint64x2_t comb2a = vzip1q_u64(y4, y6); + uint64x2_t comb2b = vzip2q_u64(y4, y6); + uint64x2_t comb3a = vzip1q_u64(y5, y7); + uint64x2_t comb3b = vzip2q_u64(y5, y7); + uint64x2_t comb4a = vzip1q_u64(y8, y10); + uint64x2_t comb4b = vzip2q_u64(y8, y10); + uint64x2_t comb5a = vzip1q_u64(y9, y11); + uint64x2_t comb5b = vzip2q_u64(y9, y11); + uint64x2_t comb6a = vzip1q_u64(y12, y14); + uint64x2_t comb6b = vzip2q_u64(y12, y14); + uint64x2_t comb7a = vzip1q_u64(y13, y15); + uint64x2_t comb7b = vzip2q_u64(y13, y15); + + // The top 4 bits of the swizzle are handled by plain reg moves here. 
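+        // Each comb register holds texels 0..7 (or 8..15) of rows y and y+2, i.e. bits 0..3 = (X0..X2, Y1);
+        // the grouping below supplies bits 4..7 = (Y0, Y2, X3, Y3) per the MicroEq.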
+ uint64x2x4_t out0 = { { comb0a, comb1a, comb2a, comb3a } }; + uint64x2x4_t out1 = { { comb0b, comb1b, comb2b, comb3b } }; + uint64x2x4_t out2 = { { comb4a, comb5a, comb6a, comb7a } }; + uint64x2x4_t out3 = { { comb4b, comb5b, comb6b, comb7b } }; + + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 0 * 64)), out0); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 1 * 64)), out1); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 2 * 64)), out2); + vst1q_u64_x4(reinterpret_cast(VoidPtrInc(pImgMicroblock, 3 * 64)), out3); + } +}; +#endif // ADDR_HAS_NEON + +} +#endif diff --git a/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp b/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp index 190b1b0a5ea..75e47bf2812 100644 --- a/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp +++ b/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -2782,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if ((forbid64KbBlockType == FALSE) && (forbidVarBlockType == FALSE)) { + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_8 maxFmaskSwizzleModeType = 2; - const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1); const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags); const UINT_32 numSlices = Max(pIn->numSlices, 1u); const UINT_32 width = Max(pIn->width, 1u); @@ -3097,8 +3099,10 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( // Tracks the size of each valid swizzle mode's surface in bytes UINT_64 padSize[AddrBlockMaxTiledType] = {}; - const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1); + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use UINT_64 minSize = 0; // Tracks the minimum acceptable block type @@ -4111,7 +4115,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx10Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. @@ -4177,7 +4181,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -4192,35 +4196,27 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -4230,7 +4226,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface( * Gfx10Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -4296,7 +4292,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -4305,40 +4301,32 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; diff --git a/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp b/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp index 4fd4fc19474..83dfeee09dc 100644 --- a/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp +++ b/src/amd/addrlib/src/gfx11/gfx11addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -758,6 +758,14 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily( case FAMILY_PHX: m_settings.isPhoenix = 1; break; + case FAMILY_GFX1170: + { + if (ASICREV_IS_GFX1170(chipRevision)) + { + m_settings.isGfx1170 = 1; + } + } + break; default: ADDR_ASSERT(!"Unknown chip family"); break; @@ -2651,10 +2659,13 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( UINT_64 padSize[AddrBlockMaxTiledType] = {}; - const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); - const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + UINT_32 ratioLow; + UINT_32 ratioHi; + GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; + UINT_32 selectedBlk = AddrBlockMaxTiledType; UINT_64 minSize = 0; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; @@ -2678,11 +2689,66 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( { padSize[i] = localOut.surfSize; - if ((minSize == 0) || - Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + if (pIn->useBlockBasedHeuristic) { - minSize = padSize[i]; - minSizeBlk = i; + const UINT_32 blockCountX = localOut.pitch / localOut.blockWidth; + const UINT_32 blockCountY = localOut.height / localOut.blockHeight; + const UINT_32 blockCountZ = localOut.numSlices / localOut.blockSlices; + + UINT_32 requiredBlockCountX = 1; + UINT_32 requiredBlockCountY = 1; + UINT_32 requiredBlockCountZ = 1; + + switch (pIn->resourceType) + { + case ADDR_RSRC_TEX_1D: + requiredBlockCountX = 2; + break; + case ADDR_RSRC_TEX_2D: + requiredBlockCountX = 2; + requiredBlockCountY = 2; + break; + case ADDR_RSRC_TEX_3D: + requiredBlockCountX = 2; + requiredBlockCountY = 2; + if (IsThick(pIn->resourceType, localIn.swizzleMode)) + { + requiredBlockCountZ = 2; + } + break; + default: + ADDR_ASSERT_ALWAYS(); + } + + // If the block count is sufficient, select this block type. Otherwise, track the block type with minimum size to + // fall back to it, in case no block type can satisfy the block count requirement. + if ((blockCountX >= requiredBlockCountX) && + (blockCountY >= requiredBlockCountY) && + (blockCountZ >= requiredBlockCountZ) && + (localIn.swizzleMode != ADDR_SW_LINEAR)) + { + selectedBlk = i; + } + else + { + const bool has3DThick = (allowedSwModeSet.value & Gfx11Rsrc3dThickSwModeMask) != 0; + const bool is3DThin = (pOut->resourceType == ADDR_RSRC_TEX_3D) && + IsThin(pOut->resourceType, swMode[i]); + if (((has3DThick && is3DThin) == FALSE) && (minSize == 0 || (padSize[i] < minSize))) + { + minSize = padSize[i]; + minSizeBlk = i; + } + } + } + else + { + if ((minSize == 0) || + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + { + minSize = padSize[i]; + minSizeBlk = i; + } } } else @@ -2693,63 +2759,77 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } } - if (pIn->memoryBudget > 1.0) + if (pIn->useBlockBasedHeuristic) { - // If minimum size is given by swizzle mode with bigger-block type, then don't ever check - // smaller-block type again in coming loop - switch (minSizeBlk) + // If there was no block size that would satisfy block based heuristic, fall back to the budget-based heuristic. 
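+        // [Editorial example - not part of the original change] This fallback triggers for
+        // surfaces too small to span two blocks on the required axes in any tiled mode
+        // (e.g. a mip only a few texels across); in that case the smallest-footprint
+        // candidate tracked above is used instead.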
+ if (selectedBlk == AddrBlockMaxTiledType) { - case AddrBlockThick256KB: - allowedBlockSet.gfx11.thin256KB = 0; - case AddrBlockThin256KB: - allowedBlockSet.macroThick64KB = 0; - case AddrBlockThick64KB: - allowedBlockSet.macroThin64KB = 0; - case AddrBlockThin64KB: - allowedBlockSet.macroThick4KB = 0; - case AddrBlockThick4KB: - allowedBlockSet.macroThin4KB = 0; - case AddrBlockThin4KB: - allowedBlockSet.micro = 0; - case AddrBlockMicro: - allowedBlockSet.linear = 0; - case AddrBlockLinear: - break; - - default: - ADDR_ASSERT_ALWAYS(); - break; + selectedBlk = minSizeBlk; } - - for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) + } + else + { + if (pIn->memoryBudget > 1.0) { - if ((i != minSizeBlk) && - Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<::AddrBlockType>(i))) + // If minimum size is given by swizzle mode with bigger-block type, then don't ever check + // smaller-block type again in coming loop + switch (minSizeBlk) { - if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + case AddrBlockThick256KB: + allowedBlockSet.gfx11.thin256KB = 0; + case AddrBlockThin256KB: + allowedBlockSet.macroThick64KB = 0; + case AddrBlockThick64KB: + allowedBlockSet.macroThin64KB = 0; + case AddrBlockThin64KB: + allowedBlockSet.macroThick4KB = 0; + case AddrBlockThick4KB: + allowedBlockSet.macroThin4KB = 0; + case AddrBlockThin4KB: + allowedBlockSet.micro = 0; + case AddrBlockMicro: + allowedBlockSet.linear = 0; + case AddrBlockLinear: + break; + + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) + { + if ((i != minSizeBlk) && + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - // Clear the block type if the memory waste is unacceptable - allowedBlockSet.value &= ~(1u << (i - 1)); + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + { + // Clear the block type if the memory waste is unacceptable + allowedBlockSet.value &= ~(1u << (i - 1)); + } } } + + // Remove linear block type if 2 or more block types are allowed + if (IsPow2(allowedBlockSet.value) == FALSE) + { + allowedBlockSet.linear = 0; + } + + // Select the biggest allowed block type + minSizeBlk = Log2(allowedBlockSet.value) + 1; + + if (minSizeBlk == static_cast(AddrBlockMaxTiledType)) + { + minSizeBlk = AddrBlockLinear; + } } - // Remove linear block type if 2 or more block types are allowed - if (IsPow2(allowedBlockSet.value) == FALSE) - { - allowedBlockSet.linear = 0; - } - - // Select the biggest allowed block type - minSizeBlk = Log2(allowedBlockSet.value) + 1; - - if (minSizeBlk == static_cast(AddrBlockMaxTiledType)) - { - minSizeBlk = AddrBlockLinear; - } + selectedBlk = minSizeBlk; } - switch (minSizeBlk) + + switch (selectedBlk) { case AddrBlockLinear: allowedSwModeSet.value &= Gfx11LinearSwModeMask; @@ -3685,7 +3765,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx11Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. 
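[Editorial note, not part of the patch] The copy hunks below, like the gfx10 hunks above and the gfx12 hunks further down, replace the old per-slice loop with one callback invocation per region: the callback now receives the full 3D origin and extent, both memory pitches, the block counts per padded row (yBlks) and per padded slice (zBlks), the raw pipe-bank XOR, and whether the subresource sits in the mip tail. The real UnalignedCopyMemImgFunc typedef lives in addrswizzler.h and is outside this excerpt; inferred purely from the call sites in this patch, it looks roughly like the sketch below (parameter names and exact types are illustrative only):

    // Sketch of the copy-callback shape implied by the call sites in this patch.
    typedef void (*UnalignedCopyMemImgFunc)(
        void*               pImg,          // mapped surface, already offset to the mip's macro block
        void*               pMem,          // linear CPU memory for the region
        UINT_32             memRowPitch,   // bytes per row of pMem
        UINT_32             memSlicePitch, // bytes per slice of pMem
        UINT_32             yBlks,         // swizzle blocks per padded row
        UINT_32             zBlks,         // swizzle blocks per padded slice
        ADDR_COORD3D        origin,        // region origin, including any mip-tail offset
        ADDR_EXTENT3D       copyDims,      // region extent in texels
        UINT_32             pbXor,         // pipe-bank XOR of the surface
        BOOL_32             inMipTail,     // whether this mip level lives in the mip tail
        const LutAddresser& addresser);    // swizzle-pattern lookup helper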
@@ -3751,7 +3831,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -3766,35 +3846,27 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -3804,7 +3876,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface( * Gfx11Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -3870,7 +3942,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem( LutAddresser addresser = LutAddresser(); addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2); - UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); @@ -3879,40 +3951,32 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 
0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. - UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; diff --git a/src/amd/addrlib/src/gfx11/gfx11addrlib.h b/src/amd/addrlib/src/gfx11/gfx11addrlib.h index fd6508903c1..e0596c656c1 100644 --- a/src/amd/addrlib/src/gfx11/gfx11addrlib.h +++ b/src/amd/addrlib/src/gfx11/gfx11addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,7 +36,8 @@ struct Gfx11ChipSettings { UINT_32 isStrix : 1; UINT_32 isPhoenix : 1; - UINT_32 reserved1 : 30; + UINT_32 isGfx1170 : 1; + UINT_32 reserved1 : 29; // Misc configuration bits UINT_32 reserved2 : 32; diff --git a/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp b/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp index 755b3c6adbb..0fcc8545194 100644 --- a/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp +++ b/src/amd/addrlib/src/gfx12/gfx12addrlib.cpp @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. 
* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -71,8 +71,7 @@ const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] = Gfx12Lib::Gfx12Lib( const Client* pClient) : - Lib(pClient), - m_numSwizzleBits(0) + Lib(pClient) { memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable)); } @@ -878,7 +877,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled( * Gfx12Lib::HwlCopyMemToSurface * * @brief -* Copy multiple regions from memory to a non-linear surface. +* Copy multiple regions from memory to a non-linear surface. * * @return * Error or success. @@ -925,7 +924,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( } LutAddresser addresser = LutAddresser(); - UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; + UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; if (returnCode == ADDR_OK) { const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); @@ -936,7 +935,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {}; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2); - pfnCopyUnaligned = addresser.GetCopyMemImgFunc(); + pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); // What format is this? @@ -952,35 +951,27 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. 
- UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } + addresser.DoCopyMemImgPostFlushes(pIn->copyFlags); } return returnCode; } @@ -990,7 +981,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface( * Gfx12Lib::HwlCopySurfaceToMem * * @brief -* Copy multiple regions from a non-linear surface to memory. +* Copy multiple regions from a non-linear surface to memory. * * @return * Error or success. @@ -1037,7 +1028,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( } LutAddresser addresser = LutAddresser(); - UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; + UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr; if (returnCode == ADDR_OK) { const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); @@ -1048,7 +1039,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {}; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2); - pfnCopyUnaligned = addresser.GetCopyImgMemFunc(); + pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags); if (pfnCopyUnaligned == nullptr) { ADDR_ASSERT_ALWAYS(); // What format is this? @@ -1058,78 +1049,37 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem( if (returnCode == ADDR_OK) { + addresser.DoCopyImgMemPreFlushes(pIn->copyFlags); for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++) { const ADDR3_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx]; const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId]; UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset; UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width; + UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits()); - UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX; - UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY; - UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ; + ADDR_COORD3D rawOrigin = { + pCurRegion->x + pMipInfo->mipTailCoordX, + pCurRegion->y + pMipInfo->mipTailCoordY, + pCurRegion->slice + pMipInfo->mipTailCoordZ + }; - for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++) - { - // The copy functions take the base address of the hardware slice, not the logical slice. Those are - // not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor - // for unaligned copies. 
- UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth); - UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice); - - UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch); - UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize); - - ADDR_COORD2D sliceOrigin = { xStart, yStart }; - ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height }; - - pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset), - VoidPtrInc(pCurRegion->pMem, memOffset), - pCurRegion->memRowPitch, - yBlks, - sliceOrigin, - sliceExtent, - sliceXor, - addresser); - } + pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset), + pCurRegion->pMem, + pCurRegion->memRowPitch, + pCurRegion->memSlicePitch, + yBlks, + zBlks, + rawOrigin, + pCurRegion->copyDims, + pIn->pbXor, + (pCurRegion->mipId >= localOut.firstMipIdInTail), + addresser); } } return returnCode; } - -/** -************************************************************************************************************************ -* Gfx12Lib::HwlComputePipeBankXor -* -* @brief -* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address -* -* @return -* PipeBankXor value -************************************************************************************************************************ -*/ -ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor( - const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure - ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure - ) const -{ - if ((m_numSwizzleBits != 0) && // does this configuration support swizzling - // base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes, - // Note that Linear and 256B are excluded. - (IsLinear(pIn->swizzleMode) == FALSE) && - (IsBlock256b(pIn->swizzleMode) == FALSE)) - { - pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits); - } - else - { - pOut->pipeBankXor = 0; - } - - return ADDR_OK; -} - /** ************************************************************************************************************************ * Gfx12Lib::GetSwizzlePatternInfo @@ -1263,72 +1213,13 @@ const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo( BOOL_32 Gfx12Lib::HwlInitGlobalParams( const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input { - BOOL_32 valid = TRUE; - GB_ADDR_CONFIG_GFX12 gbAddrConfig; - - gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig; - - switch (gbAddrConfig.bits.NUM_PIPES) - { - case ADDR_CONFIG_1_PIPE: - m_pipesLog2 = 0; - break; - case ADDR_CONFIG_2_PIPE: - m_pipesLog2 = 1; - break; - case ADDR_CONFIG_4_PIPE: - m_pipesLog2 = 2; - break; - case ADDR_CONFIG_8_PIPE: - m_pipesLog2 = 3; - break; - case ADDR_CONFIG_16_PIPE: - m_pipesLog2 = 4; - break; - case ADDR_CONFIG_32_PIPE: - m_pipesLog2 = 5; - break; - case ADDR_CONFIG_64_PIPE: - m_pipesLog2 = 6; - break; - default: - ADDR_ASSERT_ALWAYS(); - valid = FALSE; - break; - } - - switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE) - { - case ADDR_CONFIG_PIPE_INTERLEAVE_256B: - m_pipeInterleaveLog2 = 8; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_512B: - m_pipeInterleaveLog2 = 9; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_1KB: - m_pipeInterleaveLog2 = 10; - break; - case ADDR_CONFIG_PIPE_INTERLEAVE_2KB: - m_pipeInterleaveLog2 = 11; - break; - default: - ADDR_ASSERT_ALWAYS(); - valid = FALSE; - break; - } - - m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0); - // Gfx10+ chips treat packed 8-bit 422 formats as 32bpe with 2pix/elem. 
m_configFlags.use32bppFor422Fmt = TRUE; - if (valid) - { - InitEquationTable(); - InitBlockDimensionTable(); - } + InitEquationTable(); + InitBlockDimensionTable(); - return valid; + return TRUE; } /** @@ -1579,10 +1470,10 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor( pIn->slice, 0); - const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2; + const UINT_32 pipeBankXor = pipeBankXorOffset >> PipeInterleaveLog2; // Should have no bit set under pipe interleave - ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset); + ADDR_ASSERT((pipeBankXor << PipeInterleaveLog2) == pipeBankXorOffset); pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor; } @@ -2043,7 +1934,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( UINT_32 yPosMask = 0; // First get "max y bit" - for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++) + for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++) { ADDR_ASSERT(m_equationTable[eqIndex].addr[i].valid == 1); @@ -2055,7 +1946,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( } // Then loop again for populating a position mask of "max Y bit" - for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++) + for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++) { if ((m_equationTable[eqIndex].addr[i].channel == 1) && (m_equationTable[eqIndex].addr[i].index == yMax)) @@ -2074,7 +1965,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo( if ((alignedHeight >> yMax) & 1) { - *pRightXor = yPosMask >> m_pipeInterleaveLog2; + *pRightXor = yPosMask >> PipeInterleaveLog2; } } } diff --git a/src/amd/addrlib/src/gfx12/gfx12addrlib.h b/src/amd/addrlib/src/gfx12/gfx12addrlib.h index fec4db0ce49..22cdb4959c5 100644 --- a/src/amd/addrlib/src/gfx12/gfx12addrlib.h +++ b/src/amd/addrlib/src/gfx12/gfx12addrlib.h @@ -1,7 +1,7 @@ /* ************************************************************************************************************************ * -* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. * SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -147,10 +147,6 @@ private: static const UINT_32 MaxImageDim = 32768; // Max image size is 32k static const UINT_32 MaxMipLevels = 16; - virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( - const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, - ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override; - virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override; virtual ADDR_E_RETURNCODE HwlComputeStereoInfo( @@ -172,8 +168,6 @@ private: const ADDR3_COPY_MEMSURFACE_REGION* pRegions, UINT_32 regionCount) const override; - UINT_32 m_numSwizzleBits; - // Initialize equation table VOID InitEquationTable();
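
[Editorial note, not part of the patch] Several hunks above replace the open-coded ratioLow/ratioHi ternaries in the gfx10 and gfx11 preferred-setting paths with a shared GetSwizzleModePreferenceRatio() call. The helper's definition is outside this excerpt; a plausible consolidation of the removed inline expressions, shown only as a sketch (the flag set it actually consults may be wider), is:

    // Sketch: reproduces the inline logic this patch removes. The removed code used a 1:1
    // budget when minimum alignment/size was requested, 3:2 under opt4space, and 2:1 otherwise.
    static void GetSwizzleModePreferenceRatioSketch(
        const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,       // assumed input type
        UINT_32*                                      pRatioLow,
        UINT_32*                                      pRatioHi)
    {
        if (pIn->flags.minimizeAlign)
        {
            *pRatioLow = 1;
            *pRatioHi  = 1;
        }
        else if (pIn->flags.opt4space)
        {
            *pRatioLow = 3;
            *pRatioHi  = 2;
        }
        else
        {
            *pRatioLow = 2;
            *pRatioHi  = 1;
        }
    }

The resulting ratios feed Addr2BlockTypeWithinMemoryBudget() at the same call sites as before.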