amd: import gfx11.7 addrlib

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40958>
This commit is contained in:
Ganesh Belgur Ramachandra 2026-03-23 22:14:28 +00:00 committed by Samuel Pitoiset
parent dfca417db8
commit d778ede72c
20 changed files with 3407 additions and 509 deletions

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -24,7 +24,7 @@ extern "C"
#endif
#define ADDRLIB_VERSION_MAJOR 10
#define ADDRLIB_VERSION_MINOR 1
#define ADDRLIB_VERSION_MINOR 6
#define ADDRLIB_MAKE_VERSION(major, minor) ((major << 16) | minor)
#define ADDRLIB_VERSION ADDRLIB_MAKE_VERSION(ADDRLIB_VERSION_MAJOR, ADDRLIB_VERSION_MINOR)
@ -107,6 +107,11 @@ typedef struct _ADDR_EXTENT3D
* AddrComputeFmaskAddrFromCoord()
* AddrComputeFmaskCoordFromAddr()
*
* /////////////////////////////////////////////////////////////////////////////////////////////////
* // Format properties functions
* /////////////////////////////////////////////////////////////////////////////////////////////////
* AddrFormatProperties()
*
**/
/**
* /////////////////////////////////////////////////////////////////////////////////////////////////
@ -452,6 +457,49 @@ ADDR_E_RETURNCODE ADDR_API AddrCreate(
ADDR_E_RETURNCODE ADDR_API AddrDestroy(
ADDR_HANDLE hLib);
/**
****************************************************************************************************
*   ADDR_FORMAT_PROPERTIES_IN
*
*   @brief
*       Input structure to the AddrFormatProperties routine.
*
****************************************************************************************************
*/
typedef struct _ADDR_FORMAT_PROPERTIES_IN {
UINT_32 size; ///< Size of this structure in bytes; checked against
/// sizeof(ADDR_FORMAT_PROPERTIES_IN) when size-field validation is enabled
AddrFormat format; ///< Format whose properties are being queried
} ADDR_FORMAT_PROPERTIES_IN;
/**
****************************************************************************************************
*   ADDR_FORMAT_PROPERTIES_OUT
*
*   @brief
*       Output structure from the AddrFormatProperties routine.
*
****************************************************************************************************
*/
typedef struct _ADDR_FORMAT_PROPERTIES_OUT {
UINT_32 size; ///< Size of this structure in bytes; checked against
/// sizeof(ADDR_FORMAT_PROPERTIES_OUT) when size-field validation is enabled
UINT_32 bpp; ///< Bits per pixel as laid out in memory (eg. 128bpp for BC7)
ADDR_EXTENT2D expand; ///< Dimensions of one macro pixel block
/// (width/height are filled by the element library)
} ADDR_FORMAT_PROPERTIES_OUT;
/**
****************************************************************************************************
* AddrFormatProperties
*
* @brief
* Gets a list of format properties
*
****************************************************************************************************
*/
ADDR_E_RETURNCODE ADDR_API AddrFormatProperties(
ADDR_HANDLE hLib,
const ADDR_FORMAT_PROPERTIES_IN* in,
ADDR_FORMAT_PROPERTIES_OUT* pOut);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Surface functions
@ -2463,6 +2511,7 @@ typedef union _ADDR2_SURFACE_FLAGS
UINT_32 rotated : 1; ///< This resource is rotated and displayable
UINT_32 needEquation : 1; ///< This resource needs equation to be generated if possible
UINT_32 opt4space : 1; ///< This resource should be optimized for space
UINT_32 computeMaxSize : 1; ///< This resource should select the largest swizzle possible
UINT_32 minimizeAlign : 1; ///< This resource should use minimum alignment
UINT_32 noMetadata : 1; ///< This resource has no metadata
UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata
@ -2470,7 +2519,7 @@ typedef union _ADDR2_SURFACE_FLAGS
UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array
UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs)
UINT_32 requireMetadata : 1; ///< This resource must support metadata
UINT_32 reserved : 11; ///< Reserved bits
UINT_32 reserved : 10; ///< Reserved bits
};
UINT_32 value;
@ -2666,6 +2715,31 @@ ADDR_E_RETURNCODE ADDR_API Addr2ComputeSurfaceAddrFromCoord(
const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut);
/**
****************************************************************************************************
*   ADDR_COPY_FLAGS
*
*   @brief
*       Options controlling image copy functions. 'blockMemcpy' and 'hybridMemcpy' are mutually
*       exclusive; the copy entry points reject inputs that set both.
****************************************************************************************************
*/
typedef union _ADDR_COPY_FLAGS {
struct
{
UINT_32 blockMemcpy : 1; ///< Memory layout is pre-swizzled and stored block-by-block.
/// For regions in the miptail, this uses hybrid memcpy.
/// Regions must cover full width/height of the subresource.
UINT_32 hybridMemcpy : 1; ///< Memory layout is partially pre-swizzled and stored
/// microblock-by-microblock. Data in this format is agnostic to
/// chip harvesting and block size. Regions will be padded out
/// to microblock boundaries for alignment.
/// Mutually exclusive with 'blockMemcpy'.
UINT_32 reserved : 30; ///< Reserved bits
};
UINT_32 value; ///< Aggregate view of all flag bits
} ADDR_COPY_FLAGS;
/**
****************************************************************************************************
* ADDR2_COPY_MEMSURFACE_REGION
@ -2718,6 +2792,7 @@ typedef struct _ADDR2_COPY_MEMSURFACE_INPUT
/// - copyDims.depth == 1
/// - all copy regions target the same mip
/// - all copy regions target the same slice/depth
ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed.
} ADDR2_COPY_MEMSURFACE_INPUT;
/**
@ -4008,30 +4083,34 @@ typedef union _ADDR2_SWMODE_SET
*/
typedef struct _ADDR2_GET_PREFERRED_SURF_SETTING_INPUT
{
UINT_32 size; ///< Size of this structure in bytes
UINT_32 size; ///< Size of this structure in bytes
ADDR2_SURFACE_FLAGS flags; ///< Surface flags
AddrResourceType resourceType; ///< Surface type
AddrFormat format; ///< Surface format
AddrResrouceLocation resourceLoction; ///< Surface heap choice
ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting
///< such as linear for DXTn, tiled for YUV
ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted
BOOL_32 noXor; ///< Do not use xor mode for this resource
UINT_32 bpp; ///< bits per pixel
UINT_32 width; ///< Width (of mip0), in pixels
UINT_32 height; ///< Height (of mip0), in pixels
UINT_32 numSlices; ///< Number surface slice/depth (of mip0),
UINT_32 numMipLevels; ///< Total mipmap levels.
UINT_32 numSamples; ///< Number of samples
UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as
/// number of samples for normal AA; Set it to the
/// number of fragments for EQAA
UINT_32 maxAlign; ///< maximum base/size alignment requested by client
UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will
/// be padded to multiple of this value (in bytes)
DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible
/// size.
ADDR2_SURFACE_FLAGS flags; ///< Surface flags
AddrResourceType resourceType; ///< Surface type
AddrFormat format; ///< Surface format
AddrResrouceLocation resourceLoction; ///< Surface heap choice
ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting
///< such as linear for DXTn, tiled for YUV
ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted
BOOL_32 noXor; ///< Do not use xor mode for this resource
UINT_32 bpp; ///< bits per pixel
UINT_32 width; ///< Width (of mip0), in pixels
UINT_32 height; ///< Height (of mip0), in pixels
UINT_32 numSlices; ///< Number surface slice/depth (of mip0),
UINT_32 numMipLevels; ///< Total mipmap levels.
UINT_32 numSamples; ///< Number of samples
UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as
/// number of samples for normal AA; Set it to the
/// number of fragments for EQAA
UINT_32 maxAlign; ///< maximum base/size alignment requested by client
UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will
/// be padded to multiple of this value (in bytes)
DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible
/// size.
bool useBlockBasedHeuristic; ///< Use the block-based heuristic for swizzle mode selection.
/// The heuristic has the property of image size predictably
/// with image extents, which is needed for Vulkan. It ignores
/// minSizeAlign, maxAlign and memoryBudget options
} ADDR2_GET_PREFERRED_SURF_SETTING_INPUT;
/**
@ -4488,6 +4567,7 @@ typedef struct _ADDR3_COPY_MEMSURFACE_INPUT
/// - copyDims.depth == 1
/// - all copy regions target the same mip
/// - all copy regions target the same slice/depth
ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed.
} ADDR3_COPY_MEMSURFACE_INPUT;
/**

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/

View file

@ -20,6 +20,7 @@ files_addrlib = files(
'src/core/addrobject.h',
'src/core/addrswizzler.cpp',
'src/core/addrswizzler.h',
'src/core/addrswizzlersimd.h',
'src/core/coord.cpp',
'src/core/coord.h',
'src/gfx9/gfx9addrlib.cpp',

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -81,6 +81,34 @@ ADDR_E_RETURNCODE ADDR_API AddrDestroy(
return returnCode;
}
/**
****************************************************************************************************
*   AddrFormatProperties
*
*   @brief
*       Retrieves properties of the specified format.
*
*   @return
*       ADDR_E_RETURNCODE; ADDR_INVALIDPARAMS on a NULL handle or NULL struct pointer.
*
****************************************************************************************************
*/
ADDR_E_RETURNCODE ADDR_API AddrFormatProperties(
    ADDR_HANDLE                      hLib,  ///< [in]  handle returned by AddrCreate()
    const ADDR_FORMAT_PROPERTIES_IN* pIn,   ///< [in]  format query input
    ADDR_FORMAT_PROPERTIES_OUT*      pOut)  ///< [out] format properties
{
    ADDR_E_RETURNCODE retCode = ADDR_INVALIDPARAMS;

    // Take the input by pointer to match the extern "C" declaration in addrinterface.h
    // (a reference parameter would be a conflicting redeclaration under C linkage and is
    // not callable from C clients). Validate all pointers before dereferencing.
    if ((hLib != NULL) && (pIn != NULL) && (pOut != NULL))
    {
        Lib* pLib = Lib::GetLib(hLib);

        if (pLib != NULL)
        {
            retCode = pLib->GetFormatProperties(*pIn, pOut);
        }
    }

    return retCode;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Surface functions

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2017-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -29,6 +29,7 @@
#define FAMILY_NV3 0x91 //# 145 / Navi: 3x
#define FAMILY_STX 0x96
#define FAMILY_PHX 0x94 //# 148 / Phoenix
#define FAMILY_GFX1170 0x9A
#define FAMILY_RMB 0x92 //# 146 / Rembrandt
#define FAMILY_RPL 0x95 //# 149 / Raphael
#define FAMILY_MDN 0x97 //# 151 / Mendocino
@ -109,6 +110,7 @@
#define AMDGPU_PHOENIX2_RANGE 0x80, 0xC0 //# 128 <= x < 192
#define AMDGPU_HAWK_POINT1_RANGE 0xC0, 0xF0 //# 192 <= x < 240
#define AMDGPU_HAWK_POINT2_RANGE 0xF0, 0xFF //# 240 <= x < 255
#define AMDGPU_GFX1170_RANGE 0x01, 0x40 //# 1 <= x < 64
#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255
#define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max
@ -189,6 +191,7 @@
#define ASICREV_IS_PHOENIX2(r) ASICREV_IS(r, PHOENIX2)
#define ASICREV_IS_HAWK_POINT1(r) ASICREV_IS(r, HAWK_POINT1)
#define ASICREV_IS_HAWK_POINT2(r) ASICREV_IS(r, HAWK_POINT2)
#define ASICREV_IS_GFX1170(r) ASICREV_IS(r, GFX1170)
#define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT)
#define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL)

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -360,7 +360,7 @@ static inline UINT_32 BitMaskScanForward(
{
ADDR_ASSERT(mask > 0);
unsigned long out = 0;
#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))
#if ((defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))) && !defined(_M_ARM64EC)
out = ::_tzcnt_u32(mask);
#elif (defined(_WIN32) || defined(_WIN64))
::_BitScanForward(&out, mask);
@ -436,6 +436,22 @@ static inline UINT_64 IsPow2(
return !(dim & (dim - 1));
}
/**
****************************************************************************************************
*   RoundUpToMultiple
*
*   @brief
*       Rounds up the specified integer to the nearest multiple of the specified alignment value.
****************************************************************************************************
*/
template <typename T>
constexpr T RoundUpToMultiple(
    T operand,   ///< Value to be aligned.
    T alignment) ///< Alignment desired.
{
    // Bias the operand by (alignment - 1) so the truncating integer division
    // lands on the next multiple at or above the original value.
    const T biased = operand + (alignment - 1);

    return (biased / alignment) * alignment;
}
/**
****************************************************************************************************
* PowTwoAlign
@ -647,6 +663,25 @@ static inline UINT_32 Log2(
return (x != 0) ? (31 ^ BitMaskScanReverse(x)) : 0;
}
/**
****************************************************************************************************
*   ConstexprLog2
*
*   @brief
*       Compile-time-capable floor(log2(x)); works whether or not x is a power of two.
*       Returns 0 when x is 0.
****************************************************************************************************
*/
static constexpr inline UINT_32 ConstexprLog2(
    UINT_32 x) ///< [in] the value should calculate log based 2
{
    UINT_32 result = 0;

    // Count how many times the value can be halved before reaching zero;
    // for x > 0 this is floor(log2(x)), and for x == 0 the loop never runs.
    for (UINT_32 v = x >> 1; v != 0; v >>= 1)
    {
        result++;
    }

    return result;
}
/**
****************************************************************************************************
* QLog2

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -222,6 +222,7 @@ ADDR_E_RETURNCODE Lib::Create(
case FAMILY_NV3:
case FAMILY_STX:
case FAMILY_PHX:
case FAMILY_GFX1170:
pLib = Gfx11HwlInit(&client);
break;
case FAMILY_NV4:
@ -304,6 +305,44 @@ ADDR_E_RETURNCODE Lib::Create(
return returnCode;
}
/**
****************************************************************************************************
*   Lib::GetFormatProperties
*
*   @brief
*       Returns the properties of the format as specified in the input.
*   @return
*       ADDR_E_RETURNCODE
****************************************************************************************************
*/
ADDR_E_RETURNCODE Lib::GetFormatProperties(
    const ADDR_FORMAT_PROPERTIES_IN& in,   ///< [in]  format to query
    ADDR_FORMAT_PROPERTIES_OUT*      pOut  ///< [out] bpp and macro-pixel extents
    ) const
{
    // When the client enabled size-field validation, both structures must carry the
    // exact sizes this library was compiled against.
    if ((GetFillSizeFieldsFlags() == TRUE) &&
        ((in.size != sizeof(ADDR_FORMAT_PROPERTIES_IN)) ||
         (pOut->size != sizeof(ADDR_FORMAT_PROPERTIES_OUT))))
    {
        return ADDR_PARAMSIZEMISMATCH;
    }

    // The element library reports bits-per-pixel and fills in the macro-pixel
    // ("expand") block dimensions for the format.
    pOut->bpp = GetElemLib()->GetBitsPerPixel(in.format,
                                              nullptr, // elemMode, unused
                                              &pOut->expand.width,
                                              &pOut->expand.height,
                                              nullptr); // unused bits

    return ADDR_OK;
}
/**
****************************************************************************************************
* Lib::SetChipFamily
@ -315,7 +354,7 @@ ADDR_E_RETURNCODE Lib::Create(
****************************************************************************************************
*/
VOID Lib::SetChipFamily(
UINT_32 uChipFamily, ///< [in] chip family defined in atiih.h
UINT_32 uChipFamily, ///< [in] chip family defined in atiid.h
UINT_32 uChipRevision) ///< [in] chip revision defined in "asic_family"_id.h
{
ChipFamily family = HwlConvertChipFamily(uChipFamily, uChipRevision);
@ -668,6 +707,47 @@ UINT_32 Lib::GetBpe(AddrFormat format) const
return GetElemLib()->GetBitsPerPixel(format);
}
/**
****************************************************************************************************
* Lib::GetSwizzleModePreferenceRatio
*
* @brief
* Get ratio driving swizzle mode selection heuristic. Ratio is returned as fraction nominator
* and denominator
* @return
* void
****************************************************************************************************
*/
void Lib::GetSwizzleModePreferenceRatio(
const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,
UINT_32* pOutRatioLo,
UINT_32* pOutRatioHi
) const
{
const BOOL_32 computeMinSize = (pIn->flags.minimizeAlign == 1) || (pIn->memoryBudget >= 1.0);
if (computeMinSize)
{
*pOutRatioLo = 1;
*pOutRatioHi = 1;
}
else if (pIn->flags.opt4space)
{
*pOutRatioLo = 3;
*pOutRatioHi = 2;
}
else if (pIn->flags.computeMaxSize)
{
*pOutRatioLo = 1024;
*pOutRatioHi = 1;
}
else
{
*pOutRatioLo = 2;
*pOutRatioHi = 1;
}
}
/**
************************************************************************************************************************
* Lib::ComputeOffsetFromSwizzlePattern

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -300,6 +300,10 @@ public:
delete this;
}
ADDR_E_RETURNCODE GetFormatProperties(
const ADDR_FORMAT_PROPERTIES_IN& in,
ADDR_FORMAT_PROPERTIES_OUT* pOut) const;
static Lib* GetLib(ADDR_HANDLE hLib);
/// Returns which version of addrlib functions should be used.
@ -333,6 +337,10 @@ public:
UINT_32 GetBpe(AddrFormat format) const;
void GetSwizzleModePreferenceRatio(
const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,
UINT_32* pOutRatioLo,
UINT_32* pOutRatioHi) const;
static UINT_32 ComputeOffsetFromSwizzlePattern(
const UINT_64* pPattern,

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -2473,7 +2473,7 @@ UINT_64 Lib::HwlComputeXmaskAddrFromCoord(
//
macroTileIndexX = x / macroTileWidth;
macroTileIndexY = y / macroTileHeight;
macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
macroTileOffset = (static_cast<UINT_64>(macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
//
// Compute the pixel offset within the macro tile.
@ -2675,7 +2675,7 @@ VOID Lib::ComputeSurfaceCoordFromAddrMicroTiled(
//
sliceBits = static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples;
rowBits = (pitch / MicroTileWidth) * microTileBits;
rowBits = static_cast<UINT_64>(pitch / MicroTileWidth) * microTileBits;
//
// Extract the slice index.
@ -3559,11 +3559,11 @@ BOOL_32 Lib::DegradeTo1D(
if (degrade == FALSE)
{
// Only check width and height as slices are aligned to thickness
UINT_64 unalignedSize = width * height;
UINT_64 unalignedSize = static_cast<UINT_64>(width) * height;
UINT_32 alignedPitch = PowTwoAlign(width, macroTilePitchAlign);
UINT_32 alignedHeight = PowTwoAlign(height, macroTileHeightAlign);
UINT_64 alignedSize = alignedPitch * alignedHeight;
UINT_64 alignedSize = static_cast<UINT_64>(alignedPitch) * alignedHeight;
// alignedSize > 1.5 * unalignedSize
if (2 * alignedSize > 3 * unalignedSize)

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -207,7 +207,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo(
// Overwrite these parameters if we have a valid format
}
if (localIn.bpp != 0)
if (localIn.bpp >= 8)
{
localIn.width = Max(localIn.width, 1u);
localIn.height = Max(localIn.height, 1u);
@ -444,8 +444,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface(
void* pMipBase = VoidPtrInc(pIn->pMappedSurface,
(pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset));
const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch;
const size_t lineSizeBytes = (static_cast<size_t>(localIn.bpp) >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (static_cast<size_t>(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch;
for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++)
{
@ -504,6 +504,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -573,6 +578,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -1424,7 +1434,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear(
{
pOut->addr = (localOut.sliceSize * pIn->slice) +
mipInfo[pIn->mipId].offset +
(pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3);
(static_cast<size_t>(pIn->y) * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3);
pOut->bitPosition = 0;
}
else

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -39,8 +39,6 @@ namespace V3
Lib::Lib()
:
Addr::Lib(),
m_pipesLog2(0),
m_pipeInterleaveLog2(0),
m_numEquations(0)
{
Init();
@ -59,8 +57,6 @@ Lib::Lib(
const Client* pClient)
:
Addr::Lib(pClient),
m_pipesLog2(0),
m_pipeInterleaveLog2(0),
m_numEquations(0)
{
Init();
@ -265,7 +261,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo(
// Overwrite these parameters if we have a valid format
}
if (localIn.bpp != 0)
if (localIn.bpp >= 8)
{
localIn.width = Max(localIn.width, 1u);
localIn.height = Max(localIn.height, 1u);
@ -547,8 +543,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface(
void* pMipBase = VoidPtrInc(pIn->pMappedSurface,
(pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset));
const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch;
const size_t lineSizeBytes = (static_cast<size_t>(localIn.bpp) >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (static_cast<size_t>(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch;
for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++)
{
@ -611,6 +607,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -680,6 +681,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -736,7 +742,7 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn,
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut)
{
ADDR_E_RETURNCODE returnCode;
ADDR_E_RETURNCODE returnCode = ADDR_OK;
if ((GetFillSizeFieldsFlags() == TRUE) &&
((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) ||
@ -746,7 +752,23 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor(
}
else
{
returnCode = HwlComputePipeBankXor(pIn, pOut);
// The swizzle mode determines how many unused bits there are in the address. We never (ok, rarely...) program
// the low eight bits of the address, so the "numSwizzleBits" effectively represents the number of "guaranteed
// zero" programmed bits in the address.
const UINT_32 numSwizzleBits = GetBlockSizeLog2(pIn->swizzleMode, FALSE) - 8;
// make sure this configuration supports swizzling
if (numSwizzleBits != 0)
{
// These cases should have been excluded with the "numSwizzleBits" calculation above, but make sure here.
ADDR_ASSERT((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE));
pOut->pipeBankXor = pIn->surfIndex % (1 << numSwizzleBits);
}
else
{
pOut->pipeBankXor = 0;
}
}
return returnCode;
@ -1167,7 +1189,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfoSanityCheck(
return HwlValidateNonSwModeParams(&localIn) ? ADDR_OK : ADDR_INVALIDPARAMS;
}
/**
************************************************************************************************************************
* Lib::ComputeOffsetFromEquation

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -44,7 +44,6 @@ struct ADDR3_COORD
struct ADDR3_COMPUTE_SURFACE_INFO_PARAMS_INPUT
{
const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pSurfInfo;
void* pvAddrParams;
};
/**
@ -155,14 +154,16 @@ protected:
Lib(); // Constructor is protected
Lib(const Client* pClient);
UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2
UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes
SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table
// Number of unique MSAA sample rates (1/2/4/8)
static const UINT_32 MaxNumMsaaRates = 4;
//# These fields exist in the GB_ADDR_CONFIG register; however, the HW does not care about them.
//# The HW acts as if the log2(pipes)==5 and log2(pi) == 8, always.
static const UINT_32 NumPipesLog2 = 5;
static const UINT_32 PipeInterleaveLog2 = 8;
// Number of equation entries in the table
UINT_32 m_numEquations;
@ -444,4 +445,4 @@ private:
} // V3
} // Addr
#endif
#endif

View file

@ -2,7 +2,8 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -14,6 +15,7 @@
*/
#include "addrswizzler.h"
#include "addrswizzlersimd.h"
namespace Addr
{
@ -98,6 +100,23 @@ void LutAddresser::InitSwizzleProps()
m_sLutMask |= m_bit[i].s;
}
// Derive the microblock size from the swizzle equation.
UINT_32 xMbMask = 0;
UINT_32 yMbMask = 0;
UINT_32 zMbMask = 0;
for (UINT_32 i = 0; i < 8; i++)
{
xMbMask |= m_bit[i].x;
yMbMask |= m_bit[i].y;
zMbMask |= m_bit[i].z;
}
m_microBlockSize.width = xMbMask + 1;
m_microBlockSize.height = yMbMask + 1;
m_microBlockSize.depth = zMbMask + 1;
ADDR_ASSERT(IsPow2(m_microBlockSize.width));
ADDR_ASSERT(IsPow2(m_microBlockSize.height));
ADDR_ASSERT(IsPow2(m_microBlockSize.depth));
// An expandX of 1 is a no-op
m_maxExpandX = 1;
if (m_sLutMask == 0)
@ -153,7 +172,7 @@ void LutAddresser::InitLuts()
m_pYLut = &m_lutData[0];
ADDR_ASSERT(m_pYLut[0] == 0);
}
if (m_zLutMask != 0)
{
m_pZLut = &m_lutData[curOffset];
@ -269,82 +288,33 @@ UINT_32 LutAddresser::EvalEquation(
/**
****************************************************************************************************
* Copy2DSliceUnaligned
* CopyRowUnaligned
*
* @brief
* Copies an arbitrary 2D pixel region to or from a surface.
* Copies a single row to or from a surface.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX, bool ImgIsDest>
void Copy2DSliceUnaligned(
void* pImgBlockSliceStart, // Block corresponding to beginning of slice
void* pBuf, // Pointer to data starting from the copy origin.
size_t bufStrideY, // Stride of each row in pBuf
UINT_32 imageBlocksY, // Width of the image slice, in blocks.
ADDR_COORD2D origin, // Absolute origin, in elements
ADDR_EXTENT2D extent, // Size to copy, in elements
UINT_32 sliceXor, // Includes pipeBankXor and z XOR
void CopyRowUnaligned(
void* pRowImgBlockStart, // Pointer to the image block at x=0
void* pBuf, // Pointer to data at x=0
UINT_32 xStart, // x value to start at
UINT_32 xEnd, // x value to finish at (not inclusive)
UINT_32 rowXor, // Value to XOR in for each address (makes up PBX and y/z coords)
const LutAddresser& addresser)
{
UINT_32 xStart = origin.x;
UINT_32 xEnd = origin.x + extent.width;
UINT_32 x = xStart;
constexpr UINT_32 PixBytes = (1 << BPELog2);
// Apply a negative offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
pBuf = VoidPtrDec(pBuf, xStart * PixBytes);
// Do things one row at a time for unaligned regions.
for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
// Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
// regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though.
if (ExpandX > 1)
{
UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY;
UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
UINT_32 x = xStart;
// Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
// regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though.
if (ExpandX > 1)
// Unaligned left edge
for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
{
// Unaligned left edge
for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
{
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
// Aligned middle
for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
{
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
}
}
}
// Unaligned end (or the whole thing when ExpandX == 1)
for (; x < xEnd; x++)
{
// Get the index of the block within the slice
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
// Apply that index to get the base address of the current block.
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
// Grab the x-xor and XOR it all together, adding to get the final address
UINT_32 blk = (x >> addresser.GetBlockXBits());
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
@ -355,8 +325,478 @@ void Copy2DSliceUnaligned(
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
// Aligned middle
for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
{
UINT_32 blk = (x >> addresser.GetBlockXBits());
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
}
}
}
// Unaligned end (or the whole thing when ExpandX == 1)
for (; x < xEnd; x++)
{
// Get the index of the block within the slice
UINT_32 blk = (x >> addresser.GetBlockXBits());
// Apply that index to get the base address of the current block.
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
// Grab the x-xor and XOR it all together, adding to get the final address
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
}
pBuf = VoidPtrInc(pBuf, bufStrideY);
/**
****************************************************************************************************
* CopyImgUnaligned
*
* @brief
*   Copies an arbitrary 3D pixel region to or from a surface, one row at a time.
*   This is the generic slow path: each row is handed to CopyRowUnaligned, which resolves the
*   swizzled address of every pixel (or small ExpandX group) individually.
*
*   Fix: the per-row byte offset is now widened to 64 bits *before* shifting by the block size.
*   The previous `(zBlk + yBlk) << GetBlockBits()` was evaluated in 32-bit arithmetic, which
*   overflows once a block's byte offset within the image reaches 4GiB.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX, bool ImgIsDest>
void CopyImgUnaligned(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail. Unused on this path.
    const LutAddresser& addresser)
{
    constexpr UINT_32 PixBytes = (1 << BPELog2);
    // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
    // Keep the z offset.
    pBuf = VoidPtrDec(pBuf, origin.x * PixBytes);
    void* pSliceBuf = pBuf;
    // Do things one slice/row at a time for unaligned regions.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z++)
    {
        // Per-slice contributions: the swizzle's z component and the slice's block-row index.
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        void*   pRowBuf  = pSliceBuf;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
        {
            UINT_32 yBlk   = (y >> addresser.GetBlockYBits()) * imageBlocksY;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            // Widen the block index to 64 bits before shifting: a 32-bit shift overflows once
            // the byte offset of this block row exceeds 4GiB.
            UINT_64 rowOffset    = static_cast<UINT_64>(zBlk + yBlk) << addresser.GetBlockBits();
            void*   pImgBlockRow = VoidPtrInc(pImgBlockStart, rowOffset);
            CopyRowUnaligned<BPELog2, ExpandX, ImgIsDest>(
                pImgBlockRow,
                pRowBuf,
                origin.x,
                origin.x + extent.width,
                rowXor,
                addresser);
            pRowBuf = VoidPtrInc(pRowBuf, bufStrideY);
        }
        pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ);
    }
}
/**
****************************************************************************************************
* HandleUnalignedRegions
*
* @brief
*    Does unaligned copies for any X/Y/Z edges that are not fully aligned, fixing up the
*    copy region and pointer to point at the aligned region that remains.
*
*    Up to six slow CopyImgUnaligned calls are made, one per unaligned face of the region:
*    first the low x/y/z edges (which also advance *pOrigin and *ppBuf), then the high x/y/z
*    edges (which only trim *pExtent). On return, *pOrigin is aligned to 'align' and *pExtent
*    is a multiple of it, so the caller can copy the interior with an aligned fast path.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX>
void HandleUnalignedRegions(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void**              ppBuf,          // Pointer to pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D*       pOrigin,        // Absolute origin, in elements. Updated on return.
    ADDR_EXTENT3D*      pExtent,        // Size to copy, in elements. Updated on return.
    ADDR_EXTENT3D       align,          // Size to align on, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // This helper is only instantiated for the memory->image direction.
    // NOTE(review): hardcoded — confirm no image->memory caller routes through here.
    constexpr bool ImgIsDest = true;
    // Go through the start/end of the x/y/z extents and copy the parts that aren't aligned.
    // Unaligned low-x edge: copy the strip from origin.x up to the next aligned x, spanning
    // the full y/z range.
    if (pOrigin->x != PowTwoAlign(pOrigin->x, align.width))
    {
        // Clamp to the region's end in case the whole copy is narrower than one align unit.
        UINT_32 xSize = Min(pOrigin->x + pExtent->width, PowTwoAlign(pOrigin->x, align.width)) - pOrigin->x;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { xSize, pExtent->height, pExtent->depth},
            pipeBankXor,
            isInMipTail,
            addresser);
        // Shrink the region and advance the buffer pointer past the strip just copied.
        pExtent->width -= xSize;
        pOrigin->x += xSize;
        *ppBuf = VoidPtrInc(*ppBuf, xSize << BPELog2);
    }
    // Unaligned low-y edge, same idea: a slab of ySize rows across full x/z.
    if (pOrigin->y != PowTwoAlign(pOrigin->y, align.height))
    {
        UINT_32 ySize = Min(pOrigin->y + pExtent->height, PowTwoAlign(pOrigin->y, align.height)) - pOrigin->y;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { pExtent->width, ySize, pExtent->depth},
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->height -= ySize;
        pOrigin->y += ySize;
        *ppBuf = VoidPtrInc(*ppBuf, ySize * bufStrideY);
    }
    // Unaligned low-z edge: zSize leading slices across full x/y.
    if (pOrigin->z != PowTwoAlign(pOrigin->z, align.depth))
    {
        UINT_32 zSize = Min(pOrigin->z + pExtent->depth, PowTwoAlign(pOrigin->z, align.depth)) - pOrigin->z;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { pExtent->width, pExtent->height, zSize },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->depth -= zSize;
        pOrigin->z += zSize;
        *ppBuf = VoidPtrInc(*ppBuf, zSize * bufStrideZ);
    }
    // At this point the starts are aligned, so we can care about just size rather than origin+size.
    // Unaligned high-x edge: copy the trailing strip and trim the width down to a multiple of align.
    if ((pExtent->width) != PowTwoAlignDown(pExtent->width, align.width))
    {
        UINT_32 xAlignedSize = PowTwoAlignDown(pOrigin->x + pExtent->width, align.width) - pOrigin->x;
        // Local pointer only; *ppBuf still points at the (aligned) region start.
        void* pBuf = VoidPtrInc(*ppBuf, xAlignedSize << BPELog2);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x + xAlignedSize, pOrigin->y, pOrigin->z},
            { pExtent->width - xAlignedSize, pExtent->height, pExtent->depth },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->width = xAlignedSize;
    }
    // Unaligned high-y edge.
    if ((pExtent->height) != PowTwoAlignDown(pExtent->height, align.height))
    {
        UINT_32 yAlignedSize = PowTwoAlignDown(pOrigin->y + pExtent->height, align.height) - pOrigin->y;
        void* pBuf = VoidPtrInc(*ppBuf, yAlignedSize * bufStrideY);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x, pOrigin->y + yAlignedSize, pOrigin->z},
            { pExtent->width, pExtent->height - yAlignedSize, pExtent->depth },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->height = yAlignedSize;
    }
    // Unaligned high-z edge.
    if ((pExtent->depth) != PowTwoAlignDown(pExtent->depth, align.depth))
    {
        UINT_32 zAlignedSize = PowTwoAlignDown(pOrigin->z + pExtent->depth, align.depth) - pOrigin->z;
        void* pBuf = VoidPtrInc(*ppBuf, zAlignedSize * bufStrideZ);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x, pOrigin->y, pOrigin->z + zAlignedSize },
            { pExtent->width, pExtent->height, pExtent->depth - zAlignedSize },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->depth = zAlignedSize;
    }
}
/**
****************************************************************************************************
* CopyMemImgHybrid
*
* @brief
*   Copies a 3D pixel region to a surface. Uses fast copies (MicroSw::CopyMicroBlock) for fully
*   covered microblocks; unaligned x/y/z edges are peeled off first via HandleUnalignedRegions.
*
*   Fix: the microblock byte offset is now widened to 64 bits *before* shifting by the block
*   size; the previous 32-bit shift overflows once a block's byte offset reaches 4GiB. The
*   linear-buffer pixel offset is likewise computed in size_t for >4GiB buffers.
****************************************************************************************************
*/
template <class MicroSw>
AVX2_FUNC NEON_FUNC void CopyMemImgHybrid(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Handle unaligned edges in x/y/z and fixup the extents to match. After this call,
    // origin/extent describe a (possibly empty) region aligned to MicroSw::MicroBlockExtent.
    HandleUnalignedRegions<MicroSw::BpeLog2, MicroSw::ExpandX>(
        pImgBlockStart,
        &pBuf,
        bufStrideY,
        bufStrideZ,
        imageBlocksY,
        imageBlocksZ,
        &origin,
        &extent,
        MicroSw::MicroBlockExtent,
        pipeBankXor,
        isInMipTail,
        addresser
    );
    // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
    // Keep the z offset.
    pBuf = VoidPtrDec(pBuf, origin.x << MicroSw::BpeLog2);
    void* pSliceBuf = pBuf;
    // Walk the aligned interior one microblock at a time.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += MicroSw::MicroBlockExtent.depth)
    {
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        void*   pRowBuf  = pSliceBuf;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += MicroSw::MicroBlockExtent.height)
        {
            UINT_32 yBlk   = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += MicroSw::MicroBlockExtent.width)
            {
                UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk;
                // Widen the block index to 64 bits before shifting; a 32-bit shift overflows
                // once the block's byte offset exceeds 4GiB.
                UINT_64 offset = static_cast<UINT_64>(xBlk) << addresser.GetBlockBits();
                // The low bits (within 256B) come from the swizzle LUTs, XORed together.
                offset ^= rowXor;
                offset ^= addresser.GetAddressX(x);
                void* pPix    = VoidPtrInc(pImgBlockStart, offset);
                // Compute the linear-buffer offset in size_t so >4GiB buffers work.
                void* pPixBuf = VoidPtrInc(pRowBuf, static_cast<size_t>(x) << MicroSw::BpeLog2);
                MicroSw::CopyMicroBlock(
                    pPix,
                    pPixBuf,
                    bufStrideY,
                    bufStrideZ
                );
            }
            pRowBuf = VoidPtrInc(pRowBuf, bufStrideY * MicroSw::MicroBlockExtent.height);
        }
        pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ * MicroSw::MicroBlockExtent.depth);
    }
}
/**
****************************************************************************************************
* CopyMemImgMicroblocks
*
* @brief
*   Copies the microblocks of a 3D pixel region to/from a surface. The region is first padded
*   outwards to microblock boundaries, and pBuf is consumed as a dense stream of 256-byte
*   microblocks (bufStrideY / bufStrideZ are ignored).
*
*   Fix: the microblock byte offset is now widened to 64 bits *before* shifting by the block
*   size; the previous 32-bit shift overflows once a block's byte offset reaches 4GiB.
****************************************************************************************************
*/
template <bool ImgIsDest, bool NonTemporal>
AVX2_FUNC NEON_FUNC void CopyMemImgMicroblocks(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf, ignored.
    size_t              bufStrideZ,     // Stride of each slice in pBuf, ignored.
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Pad out our dims to microblock boundaries.
    origin.x      = PowTwoAlignDown(origin.x, addresser.GetMicroBlockX());
    origin.y      = PowTwoAlignDown(origin.y, addresser.GetMicroBlockY());
    origin.z      = PowTwoAlignDown(origin.z, addresser.GetMicroBlockZ());
    extent.width  = PowTwoAlign(extent.width, addresser.GetMicroBlockX());
    extent.height = PowTwoAlign(extent.height, addresser.GetMicroBlockY());
    extent.depth  = PowTwoAlign(extent.depth, addresser.GetMicroBlockZ());
    // Calculate the address of the first pixel in each microblock (256B), then copy it.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetMicroBlockZ())
    {
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetMicroBlockY())
        {
            UINT_32 yBlk   = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += addresser.GetMicroBlockX())
            {
                UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk;
                // Widen the block index to 64 bits before shifting; a 32-bit shift overflows
                // once the block's byte offset exceeds 4GiB.
                UINT_64 offset = static_cast<UINT_64>(xBlk) << addresser.GetBlockBits();
                offset ^= rowXor;
                offset ^= addresser.GetAddressX(x);
                void* pPix = VoidPtrInc(pImgBlockStart, offset);
                // A microblock is always 256 bytes.
                constexpr UINT_32 CopySize = 1 << 8;
#if ADDR_HAS_AVX2
                // Non-temporal streaming copies bypass the cache; fences are handled by the
                // DoCopy*Flushes helpers around the whole copy.
                if (NonTemporal && ImgIsDest)
                {
                    StreamCopyToImgAligned(pPix, pBuf, CopySize);
                }
                else if (NonTemporal)
                {
                    StreamCopyFromImgAligned(pBuf, pPix, CopySize);
                }
                else
#endif
                if (ImgIsDest)
                {
                    memcpy(pPix, pBuf, CopySize);
                }
                else
                {
                    memcpy(pBuf, pPix, CopySize);
                }
                pBuf = VoidPtrInc(pBuf, CopySize);
            }
        }
    }
}
/**
****************************************************************************************************
* CopyMemImgBlocks
*
* @brief
*   Copies the blocks of a 3D pixel region to/from a surface. The region is padded outwards to
*   whole-block boundaries and pBuf is consumed as a dense stream of blocks in (strided)
*   typewriter order (bufStrideY / bufStrideZ are ignored). Mip-tail regions are delegated to
*   CopyMemImgMicroblocks instead.
*
*   Fix: the block byte offset is now widened to 64 bits *before* shifting by the block size;
*   the previous 32-bit shift overflows once a block's byte offset reaches 4GiB.
****************************************************************************************************
*/
template <bool ImgIsDest, bool NonTemporal>
AVX2_FUNC NEON_FUNC void CopyMemImgBlocks(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf, ignored.
    size_t              bufStrideZ,     // Stride of each slice in pBuf, ignored.
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Mip-tail layout is not plain block order; use the microblock-granular path instead.
    if (isInMipTail)
    {
        return CopyMemImgMicroblocks<ImgIsDest, NonTemporal>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            origin,
            extent,
            pipeBankXor,
            isInMipTail,
            addresser
        );
    }
    // Pad out our dims to block boundaries.
    origin.x      = PowTwoAlignDown(origin.x, addresser.GetBlockX());
    origin.y      = PowTwoAlignDown(origin.y, addresser.GetBlockY());
    origin.z      = PowTwoAlignDown(origin.z, addresser.GetBlockZ());
    extent.width  = PowTwoAlign(extent.width, addresser.GetBlockX());
    extent.height = PowTwoAlign(extent.height, addresser.GetBlockY());
    extent.depth  = PowTwoAlign(extent.depth, addresser.GetBlockZ());
    // Copy block by block. No complex swizzling here, everything is in (strided) typewriter order.
    // A whole row of x-adjacent blocks is contiguous in the image, so copy it in one call.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetBlockZ())
    {
        UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetBlockY())
        {
            UINT_32 yBlk      = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 xBlkStart = (origin.x >> addresser.GetBlockXBits()) + yBlk;
            UINT_32 numXBlk   = extent.width >> addresser.GetBlockXBits();
            // Widen the block index to 64 bits before shifting; a 32-bit shift overflows once
            // the block's byte offset exceeds 4GiB.
            UINT_64 offset   = static_cast<UINT_64>(xBlkStart) << addresser.GetBlockBits();
            void*   pPix     = VoidPtrInc(pImgBlockStart, offset);
            UINT_32 copySize = numXBlk << addresser.GetBlockBits();
#if ADDR_HAS_AVX2
            if (NonTemporal && ImgIsDest)
            {
                StreamCopyToImgAligned(pPix, pBuf, copySize);
            }
            else if (NonTemporal)
            {
                StreamCopyFromImgAligned(pBuf, pPix, copySize);
            }
            else
#endif
            if (ImgIsDest)
            {
                memcpy(pPix, pBuf, copySize);
            }
            else
            {
                memcpy(pBuf, pPix, copySize);
            }
            pBuf = VoidPtrInc(pBuf, copySize);
        }
    }
}
@ -368,33 +808,130 @@ void Copy2DSliceUnaligned(
* Determines and returns which copy function to use for copying to images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc(
ADDR_COPY_FLAGS flags
) const
{
UnalignedCopyMemImgFunc pfnRet = nullptr;
// This key encodes how the bottom 8 bits (256B) are formed, so we can match to the correct optimized
// swizzle function (they are all swizzle-agnostic beyond those 256B).
UINT_64 microSwKey = GetMicroSwKey(reinterpret_cast<const UINT_64*>(&m_bit[0]));
if (flags.blockMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgBlocks<true, true>;
}
else
#endif
{
pfnRet = CopyMemImgBlocks<true, false>;
}
}
if ((pfnRet == nullptr) && flags.hybridMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgMicroblocks<true, true>;
}
else
#endif
{
pfnRet = CopyMemImgMicroblocks<true, false>;
}
}
// If this is one of the known microswizzles and CPU support is present, use a hybrid copy that does
// SIMD swizzling for aligned regions and falls back for unaligned edges.
#if ADDR_HAS_AVX2
static constexpr struct {
UINT_64 microSwKey;
UnalignedCopyMemImgFunc pfn;
} AvxFuncs[] = {
{ GetMicroSwKey(MicroSw_2D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_8BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_8BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_16BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_16BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_8BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_8BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_16BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_16BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_Z_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_Z_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_D_1BPE_AVX2>}
};
if ((pfnRet == nullptr) && CpuSupportsAvx2())
{
for (const auto& func : AvxFuncs)
{
if (func.microSwKey == microSwKey)
{
pfnRet = func.pfn;
break;
}
}
}
#endif // ADDR_HAS_AVX2
#if ADDR_HAS_NEON
static constexpr struct {
UINT_64 microSwKey;
UnalignedCopyMemImgFunc pfn;
} NeonFuncs[] = {
{ GetMicroSwKey(MicroSw_2D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_8BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_8BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_16BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_16BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_8BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_8BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_16BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_16BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_Z_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_Z_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_D_1BPE_NEON>}
};
if ((pfnRet == nullptr) && CpuSupportsNeon())
{
for (const auto& func : NeonFuncs)
{
if (func.microSwKey == microSwKey)
{
pfnRet = func.pfn;
break;
}
}
}
#endif // ADDR_HAS_NEON
// While these are all the same function, the codegen gets really bad if the size of each pixel
// is not known at compile time. Hence, templates.
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
{
// ExpandX = 1, 2, 4
{ Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE
{ Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE
{ Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE
{ Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE
{ Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE
{ CopyImgUnaligned<0, 1, true>, CopyImgUnaligned<0, 2, true>, CopyImgUnaligned<0, 4, true> }, // 1BPE
{ CopyImgUnaligned<1, 1, true>, CopyImgUnaligned<1, 2, true>, CopyImgUnaligned<1, 4, true> }, // 2BPE
{ CopyImgUnaligned<2, 1, true>, CopyImgUnaligned<2, 2, true>, CopyImgUnaligned<2, 4, true> }, // 4BPE
{ CopyImgUnaligned<3, 1, true>, CopyImgUnaligned<3, 2, true>, CopyImgUnaligned<3, 4, true> }, // 8BPE
{ CopyImgUnaligned<4, 1, true>, CopyImgUnaligned<4, 2, true>, CopyImgUnaligned<4, 4, true> }, // 16BPE
};
UnalignedCopyMemImgFunc pfnRet = nullptr;
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
if (m_maxExpandX >= 4)
// Fallback functions
if (pfnRet == nullptr)
{
pfnRet = Funcs[m_bpeLog2][2];
}
else if (m_maxExpandX >= 2)
{
pfnRet = Funcs[m_bpeLog2][1];
}
else
{
pfnRet = Funcs[m_bpeLog2][0];
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))];
}
return pfnRet;
}
@ -407,35 +944,139 @@ UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
* Determines and returns which copy function to use for copying from images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const
UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc(
ADDR_COPY_FLAGS flags
) const
{
UnalignedCopyMemImgFunc pfnRet = nullptr;
if (flags.blockMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgBlocks<false, true>;
}
else
#endif
{
pfnRet = CopyMemImgBlocks<false, false>;
}
}
if ((pfnRet == nullptr) && flags.hybridMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgMicroblocks<false, true>;
}
else
#endif
{
pfnRet = CopyMemImgMicroblocks<false, false>;
}
}
// While these are all the same function, the codegen gets really bad if the size of each pixel
// is not known at compile time. Hence, templates.
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
{
// ExpandX = 1, 2, 4
{ Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE
{ Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE
{ Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE
{ Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE
{ Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE
{ CopyImgUnaligned<0, 1, false>, CopyImgUnaligned<0, 2, false>, CopyImgUnaligned<0, 4, false> }, // 1BPE
{ CopyImgUnaligned<1, 1, false>, CopyImgUnaligned<1, 2, false>, CopyImgUnaligned<1, 4, false> }, // 2BPE
{ CopyImgUnaligned<2, 1, false>, CopyImgUnaligned<2, 2, false>, CopyImgUnaligned<2, 4, false> }, // 4BPE
{ CopyImgUnaligned<3, 1, false>, CopyImgUnaligned<3, 2, false>, CopyImgUnaligned<3, 4, false> }, // 8BPE
{ CopyImgUnaligned<4, 1, false>, CopyImgUnaligned<4, 2, false>, CopyImgUnaligned<4, 4, false> }, // 16BPE
};
UnalignedCopyMemImgFunc pfnRet = nullptr;
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
if (m_maxExpandX >= 4)
if (pfnRet == nullptr)
{
pfnRet = Funcs[m_bpeLog2][2];
}
else if (m_maxExpandX >= 2)
{
pfnRet = Funcs[m_bpeLog2][1];
}
else
{
pfnRet = Funcs[m_bpeLog2][0];
pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))];
}
return pfnRet;
}
/**
****************************************************************************************************
* LutAddresser::DoCopyImgMemPreFlushes
*
* @brief
*    Does any flushes required for nontemporal SIMD instructions to access the image memory.
*    Intended to be called once before an image->memory copy; a no-op when AVX2 is unavailable
*    or no non-temporal copy path is requested.
****************************************************************************************************
*/
void LutAddresser::DoCopyImgMemPreFlushes(
    ADDR_COPY_FLAGS flags
    ) const
{
#if ADDR_HAS_AVX2
    // Only the block/hybrid memcpy paths use non-temporal loads, and only when AVX2 is
    // actually available at runtime.
    if ((flags.blockMemcpy || flags.hybridMemcpy) && CpuSupportsAvx2())
    {
        // Loads are weakly ordered, and we need to ensure they start after the previous copy
        NonTemporalLoadStoreFence();
    }
#endif
}
/**
****************************************************************************************************
* LutAddresser::DoCopyMemImgPostFlushes
*
* @brief
*    Does any flushes required for nontemporal SIMD instructions to access the image memory.
*    Intended to be called once after a memory->image copy.
****************************************************************************************************
*/
void LutAddresser::DoCopyMemImgPostFlushes(
    ADDR_COPY_FLAGS flags
    ) const
{
#if ADDR_HAS_AVX2
    // NOTE(review): unlike DoCopyImgMemPreFlushes, this does not check flags.blockMemcpy /
    // flags.hybridMemcpy — presumably because the AVX2 hybrid swizzle path can also be chosen
    // without those flags; confirm the fence is intended on every AVX2-capable copy.
    if (CpuSupportsAvx2())
    {
        // Stores are weakly ordered, and we need to ensure they finish before the next submission
        // or copy.
        NonTemporalStoreFence();
    }
#endif
}
#if __cplusplus < 201703L
// Constexpr arrays need an additional definition at namespace scope until c++17.
// (Before C++17, static constexpr data members are not implicitly inline, so odr-use of
// MicroSw_*::MicroBlockExtent requires these out-of-line definitions.)
#if ADDR_HAS_AVX2
constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_D_1BPE_AVX2::MicroBlockExtent;
#endif
#if ADDR_HAS_NEON
constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_D_1BPE_NEON::MicroBlockExtent;
#endif
#endif
}

View file

@ -1,7 +1,8 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
/**
@ -26,10 +27,13 @@ typedef void (*UnalignedCopyMemImgFunc)(
void* pImgBlockSliceStart, // Block corresponding to beginning of slice
void* pBuf, // Pointer to data starting from the copy origin.
size_t bufStrideY, // Stride of each row in pBuf
size_t bufStrideZ, // Stride of each slice in pBuf
UINT_32 imageBlocksY, // Width of the image slice, in blocks.
ADDR_COORD2D origin, // Absolute origin, in elements
ADDR_EXTENT2D extent, // Size to copy, in elements
UINT_32 sliceXor, // Includes pipeBankXor and z XOR
UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks.
ADDR_COORD3D origin, // Absolute origin, in elements
ADDR_EXTENT3D extent, // Size to copy, in elements
UINT_32 pipeBankXor, // Final value to XOR into the address
BOOL_32 isInMipTail, // True if this is in the mip tail.
const LutAddresser& addresser);
// This class calculates and holds up to four lookup tables (x/y/z/s) which can be used to cheaply calculate the
@ -60,10 +64,21 @@ public:
// Get the block size
UINT_32 GetBlockBits() const { return m_blockBits; }
UINT_32 GetBlockX() const { return m_blockSize.width; }
UINT_32 GetBlockY() const { return m_blockSize.height; }
UINT_32 GetBlockZ() const { return m_blockSize.depth; }
UINT_32 GetBlockXBits() const { return Log2(m_blockSize.width); }
UINT_32 GetBlockYBits() const { return Log2(m_blockSize.height); }
UINT_32 GetBlockZBits() const { return Log2(m_blockSize.depth); }
// Get the microblock size
UINT_32 GetMicroBlockX() const { return m_microBlockSize.width; }
UINT_32 GetMicroBlockY() const { return m_microBlockSize.height; }
UINT_32 GetMicroBlockZ() const { return m_microBlockSize.depth; }
// Get other image props
UINT_32 GetBpeLog2() const { return m_bpeLog2; }
// "Fast single channel" functions to get the part that each channel contributes to be XORd together.
UINT_32 GetAddressX(UINT_32 x) const { return m_pXLut[x & m_xLutMask];}
UINT_32 GetAddressY(UINT_32 y) const { return m_pYLut[y & m_yLutMask];}
@ -71,8 +86,11 @@ public:
UINT_32 GetAddressS(UINT_32 s) const { return m_pSLut[s & m_sLutMask];}
// Get a function that can copy a single 2D slice of an image with this swizzle.
UnalignedCopyMemImgFunc GetCopyMemImgFunc() const;
UnalignedCopyMemImgFunc GetCopyImgMemFunc() const;
UnalignedCopyMemImgFunc GetCopyMemImgFunc(ADDR_COPY_FLAGS flags) const;
UnalignedCopyMemImgFunc GetCopyImgMemFunc(ADDR_COPY_FLAGS flags) const;
void DoCopyMemImgPostFlushes(ADDR_COPY_FLAGS flags) const;
void DoCopyImgMemPreFlushes(ADDR_COPY_FLAGS flags) const;
private:
// Calculate general properties of the swizzle equations
void InitSwizzleProps();
@ -99,6 +117,9 @@ private:
// The block size
ADDR_EXTENT3D m_blockSize;
// The microblock size
ADDR_EXTENT3D m_microBlockSize;
// Number of 'x' bits at the bottom of the equation. Must be a pow2 and at least 1.
// This will be used as a simple optimization to batch together operations on adjacent x pixels.

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -2782,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting(
if ((forbid64KbBlockType == FALSE) && (forbidVarBlockType == FALSE))
{
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_8 maxFmaskSwizzleModeType = 2;
const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1);
const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags);
const UINT_32 numSlices = Max(pIn->numSlices, 1u);
const UINT_32 width = Max(pIn->width, 1u);
@ -3097,8 +3099,10 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting(
// Tracks the size of each valid swizzle mode's surface in bytes
UINT_64 padSize[AddrBlockMaxTiledType] = {};
const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1);
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u);
UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use
UINT_64 minSize = 0; // Tracks the minimum acceptable block type
@ -4111,7 +4115,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx10Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -4177,7 +4181,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -4192,35 +4196,27 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -4230,7 +4226,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
* Gfx10Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -4296,7 +4292,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -4305,40 +4301,32 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -758,6 +758,14 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily(
case FAMILY_PHX:
m_settings.isPhoenix = 1;
break;
case FAMILY_GFX1170:
{
if (ASICREV_IS_GFX1170(chipRevision))
{
m_settings.isGfx1170 = 1;
}
}
break;
default:
ADDR_ASSERT(!"Unknown chip family");
break;
@ -2651,10 +2659,13 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
UINT_64 padSize[AddrBlockMaxTiledType] = {};
const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1);
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u);
UINT_32 minSizeBlk = AddrBlockMicro;
UINT_32 selectedBlk = AddrBlockMaxTiledType;
UINT_64 minSize = 0;
ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {};
@ -2678,11 +2689,66 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
{
padSize[i] = localOut.surfSize;
if ((minSize == 0) ||
Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi))
if (pIn->useBlockBasedHeuristic)
{
minSize = padSize[i];
minSizeBlk = i;
const UINT_32 blockCountX = localOut.pitch / localOut.blockWidth;
const UINT_32 blockCountY = localOut.height / localOut.blockHeight;
const UINT_32 blockCountZ = localOut.numSlices / localOut.blockSlices;
UINT_32 requiredBlockCountX = 1;
UINT_32 requiredBlockCountY = 1;
UINT_32 requiredBlockCountZ = 1;
switch (pIn->resourceType)
{
case ADDR_RSRC_TEX_1D:
requiredBlockCountX = 2;
break;
case ADDR_RSRC_TEX_2D:
requiredBlockCountX = 2;
requiredBlockCountY = 2;
break;
case ADDR_RSRC_TEX_3D:
requiredBlockCountX = 2;
requiredBlockCountY = 2;
if (IsThick(pIn->resourceType, localIn.swizzleMode))
{
requiredBlockCountZ = 2;
}
break;
default:
ADDR_ASSERT_ALWAYS();
}
// If the block count is sufficient, select this block type. Otherwise, track the block type with minimum size to
// fall back to it, in case no block type can satisfy the block count requirement.
if ((blockCountX >= requiredBlockCountX) &&
(blockCountY >= requiredBlockCountY) &&
(blockCountZ >= requiredBlockCountZ) &&
(localIn.swizzleMode != ADDR_SW_LINEAR))
{
selectedBlk = i;
}
else
{
const bool has3DThick = (allowedSwModeSet.value & Gfx11Rsrc3dThickSwModeMask) != 0;
const bool is3DThin = (pOut->resourceType == ADDR_RSRC_TEX_3D) &&
IsThin(pOut->resourceType, swMode[i]);
if (((has3DThick && is3DThin) == FALSE) && (minSize == 0 || (padSize[i] < minSize)))
{
minSize = padSize[i];
minSizeBlk = i;
}
}
}
else
{
if ((minSize == 0) ||
Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi))
{
minSize = padSize[i];
minSizeBlk = i;
}
}
}
else
@ -2693,63 +2759,77 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
}
}
if (pIn->memoryBudget > 1.0)
if (pIn->useBlockBasedHeuristic)
{
// If minimum size is given by swizzle mode with bigger-block type, then don't ever check
// smaller-block type again in coming loop
switch (minSizeBlk)
// If there was no block size that would satisfy block based heuristic, fall back to the budget-based heuristic.
if (selectedBlk == AddrBlockMaxTiledType)
{
case AddrBlockThick256KB:
allowedBlockSet.gfx11.thin256KB = 0;
case AddrBlockThin256KB:
allowedBlockSet.macroThick64KB = 0;
case AddrBlockThick64KB:
allowedBlockSet.macroThin64KB = 0;
case AddrBlockThin64KB:
allowedBlockSet.macroThick4KB = 0;
case AddrBlockThick4KB:
allowedBlockSet.macroThin4KB = 0;
case AddrBlockThin4KB:
allowedBlockSet.micro = 0;
case AddrBlockMicro:
allowedBlockSet.linear = 0;
case AddrBlockLinear:
break;
default:
ADDR_ASSERT_ALWAYS();
break;
selectedBlk = minSizeBlk;
}
for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++)
}
else
{
if (pIn->memoryBudget > 1.0)
{
if ((i != minSizeBlk) &&
Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<::AddrBlockType>(i)))
// If minimum size is given by swizzle mode with bigger-block type, then don't ever check
// smaller-block type again in coming loop
switch (minSizeBlk)
{
if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE)
case AddrBlockThick256KB:
allowedBlockSet.gfx11.thin256KB = 0;
case AddrBlockThin256KB:
allowedBlockSet.macroThick64KB = 0;
case AddrBlockThick64KB:
allowedBlockSet.macroThin64KB = 0;
case AddrBlockThin64KB:
allowedBlockSet.macroThick4KB = 0;
case AddrBlockThick4KB:
allowedBlockSet.macroThin4KB = 0;
case AddrBlockThin4KB:
allowedBlockSet.micro = 0;
case AddrBlockMicro:
allowedBlockSet.linear = 0;
case AddrBlockLinear:
break;
default:
ADDR_ASSERT_ALWAYS();
break;
}
for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++)
{
if ((i != minSizeBlk) &&
Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i)))
{
// Clear the block type if the memory waste is unacceptable
allowedBlockSet.value &= ~(1u << (i - 1));
if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE)
{
// Clear the block type if the memory waste is unacceptable
allowedBlockSet.value &= ~(1u << (i - 1));
}
}
}
// Remove linear block type if 2 or more block types are allowed
if (IsPow2(allowedBlockSet.value) == FALSE)
{
allowedBlockSet.linear = 0;
}
// Select the biggest allowed block type
minSizeBlk = Log2(allowedBlockSet.value) + 1;
if (minSizeBlk == static_cast<UINT_32>(AddrBlockMaxTiledType))
{
minSizeBlk = AddrBlockLinear;
}
}
// Remove linear block type if 2 or more block types are allowed
if (IsPow2(allowedBlockSet.value) == FALSE)
{
allowedBlockSet.linear = 0;
}
// Select the biggest allowed block type
minSizeBlk = Log2(allowedBlockSet.value) + 1;
if (minSizeBlk == static_cast<UINT_32>(AddrBlockMaxTiledType))
{
minSizeBlk = AddrBlockLinear;
}
selectedBlk = minSizeBlk;
}
switch (minSizeBlk)
switch (selectedBlk)
{
case AddrBlockLinear:
allowedSwModeSet.value &= Gfx11LinearSwModeMask;
@ -3685,7 +3765,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx11Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -3751,7 +3831,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -3766,35 +3846,27 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -3804,7 +3876,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
* Gfx11Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -3870,7 +3942,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -3879,40 +3951,32 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -36,7 +36,8 @@ struct Gfx11ChipSettings
{
UINT_32 isStrix : 1;
UINT_32 isPhoenix : 1;
UINT_32 reserved1 : 30;
UINT_32 isGfx1170 : 1;
UINT_32 reserved1 : 29;
// Misc configuration bits
UINT_32 reserved2 : 32;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -71,8 +71,7 @@ const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] =
Gfx12Lib::Gfx12Lib(
const Client* pClient)
:
Lib(pClient),
m_numSwizzleBits(0)
Lib(pClient)
{
memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable));
}
@ -878,7 +877,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx12Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -925,7 +924,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
}
LutAddresser addresser = LutAddresser();
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
if (returnCode == ADDR_OK)
{
const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode);
@ -936,7 +935,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {};
GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern);
addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2);
pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS(); // What format is this?
@ -952,35 +951,27 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -990,7 +981,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
* Gfx12Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -1037,7 +1028,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
}
LutAddresser addresser = LutAddresser();
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
if (returnCode == ADDR_OK)
{
const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode);
@ -1048,7 +1039,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {};
GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern);
addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2);
pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS(); // What format is this?
@ -1058,78 +1049,37 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR3_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;
}
/**
************************************************************************************************************************
* Gfx12Lib::HwlComputePipeBankXor
*
* @brief
* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address
*
* @return
* PipeBankXor value
************************************************************************************************************************
*/
ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure
) const
{
if ((m_numSwizzleBits != 0) && // does this configuration support swizzling
// base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes,
// Note that Linear and 256B are excluded.
(IsLinear(pIn->swizzleMode) == FALSE) &&
(IsBlock256b(pIn->swizzleMode) == FALSE))
{
pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits);
}
else
{
pOut->pipeBankXor = 0;
}
return ADDR_OK;
}
/**
************************************************************************************************************************
* Gfx12Lib::GetSwizzlePatternInfo
@ -1263,72 +1213,13 @@ const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo(
BOOL_32 Gfx12Lib::HwlInitGlobalParams(
const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
{
BOOL_32 valid = TRUE;
GB_ADDR_CONFIG_GFX12 gbAddrConfig;
gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig;
switch (gbAddrConfig.bits.NUM_PIPES)
{
case ADDR_CONFIG_1_PIPE:
m_pipesLog2 = 0;
break;
case ADDR_CONFIG_2_PIPE:
m_pipesLog2 = 1;
break;
case ADDR_CONFIG_4_PIPE:
m_pipesLog2 = 2;
break;
case ADDR_CONFIG_8_PIPE:
m_pipesLog2 = 3;
break;
case ADDR_CONFIG_16_PIPE:
m_pipesLog2 = 4;
break;
case ADDR_CONFIG_32_PIPE:
m_pipesLog2 = 5;
break;
case ADDR_CONFIG_64_PIPE:
m_pipesLog2 = 6;
break;
default:
ADDR_ASSERT_ALWAYS();
valid = FALSE;
break;
}
switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE)
{
case ADDR_CONFIG_PIPE_INTERLEAVE_256B:
m_pipeInterleaveLog2 = 8;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_512B:
m_pipeInterleaveLog2 = 9;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_1KB:
m_pipeInterleaveLog2 = 10;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_2KB:
m_pipeInterleaveLog2 = 11;
break;
default:
ADDR_ASSERT_ALWAYS();
valid = FALSE;
break;
}
m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0);
// Gfx10+ chips treat packed 8-bit 422 formats as 32bpe with 2pix/elem.
m_configFlags.use32bppFor422Fmt = TRUE;
if (valid)
{
InitEquationTable();
InitBlockDimensionTable();
}
InitEquationTable();
InitBlockDimensionTable();
return valid;
return TRUE;
}
/**
@ -1579,10 +1470,10 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor(
pIn->slice,
0);
const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2;
const UINT_32 pipeBankXor = pipeBankXorOffset >> PipeInterleaveLog2;
// Should have no bit set under pipe interleave
ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset);
ADDR_ASSERT((pipeBankXor << PipeInterleaveLog2) == pipeBankXorOffset);
pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor;
}
@ -2043,7 +1934,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
UINT_32 yPosMask = 0;
// First get "max y bit"
for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++)
for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++)
{
ADDR_ASSERT(m_equationTable[eqIndex].addr[i].valid == 1);
@ -2055,7 +1946,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
}
// Then loop again for populating a position mask of "max Y bit"
for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++)
for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++)
{
if ((m_equationTable[eqIndex].addr[i].channel == 1) &&
(m_equationTable[eqIndex].addr[i].index == yMax))
@ -2074,7 +1965,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
if ((alignedHeight >> yMax) & 1)
{
*pRightXor = yPosMask >> m_pipeInterleaveLog2;
*pRightXor = yPosMask >> PipeInterleaveLog2;
}
}
}

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -147,10 +147,6 @@ private:
static const UINT_32 MaxImageDim = 32768; // Max image size is 32k
static const UINT_32 MaxMipLevels = 16;
virtual ADDR_E_RETURNCODE HwlComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn,
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override;
virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override;
virtual ADDR_E_RETURNCODE HwlComputeStereoInfo(
@ -172,8 +168,6 @@ private:
const ADDR3_COPY_MEMSURFACE_REGION* pRegions,
UINT_32 regionCount) const override;
UINT_32 m_numSwizzleBits;
// Initialize equation table
VOID InitEquationTable();