amd: import gfx11.7 addrlib

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40958>
This commit is contained in:
Ganesh Belgur Ramachandra 2026-03-23 22:14:28 +00:00 committed by Samuel Pitoiset
parent dfca417db8
commit d778ede72c
20 changed files with 3407 additions and 509 deletions

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -24,7 +24,7 @@ extern "C"
#endif
#define ADDRLIB_VERSION_MAJOR 10
#define ADDRLIB_VERSION_MINOR 1
#define ADDRLIB_VERSION_MINOR 6
#define ADDRLIB_MAKE_VERSION(major, minor) ((major << 16) | minor)
#define ADDRLIB_VERSION ADDRLIB_MAKE_VERSION(ADDRLIB_VERSION_MAJOR, ADDRLIB_VERSION_MINOR)
@ -107,6 +107,11 @@ typedef struct _ADDR_EXTENT3D
* AddrComputeFmaskAddrFromCoord()
* AddrComputeFmaskCoordFromAddr()
*
* /////////////////////////////////////////////////////////////////////////////////////////////////
* // Format properties functions
* /////////////////////////////////////////////////////////////////////////////////////////////////
* AddrFormatProperties()
*
**/
/**
* /////////////////////////////////////////////////////////////////////////////////////////////////
@ -452,6 +457,49 @@ ADDR_E_RETURNCODE ADDR_API AddrCreate(
ADDR_E_RETURNCODE ADDR_API AddrDestroy(
ADDR_HANDLE hLib);
/**
****************************************************************************************************
*   ADDR_FORMAT_PROPERTIES_IN
*
*   @brief
*       Input structure to the AddrFormatProperties routine.
*
****************************************************************************************************
*/
typedef struct _ADDR_FORMAT_PROPERTIES_IN {
UINT_32 size; ///< Size of this structure in bytes; checked against
/// sizeof(ADDR_FORMAT_PROPERTIES_IN) when size-field validation is enabled
AddrFormat format; ///< Format whose properties are being queried
} ADDR_FORMAT_PROPERTIES_IN;
/**
****************************************************************************************************
*   ADDR_FORMAT_PROPERTIES_OUT
*
*   @brief
*       Output structure from the AddrFormatProperties routine.
*
****************************************************************************************************
*/
typedef struct _ADDR_FORMAT_PROPERTIES_OUT {
UINT_32 size; ///< Size of this structure in bytes; checked against
/// sizeof(ADDR_FORMAT_PROPERTIES_OUT) when size-field validation is enabled
UINT_32 bpp; ///< Bits per pixel as laid out in memory (eg. 128bpp for BC7)
ADDR_EXTENT2D expand; ///< Dimensions of one macro pixel block
/// (width/height are filled by the element library)
} ADDR_FORMAT_PROPERTIES_OUT;
/**
****************************************************************************************************
* AddrFormatProperties
*
* @brief
* Gets a list of format properties
*
****************************************************************************************************
*/
ADDR_E_RETURNCODE ADDR_API AddrFormatProperties(
ADDR_HANDLE hLib,
const ADDR_FORMAT_PROPERTIES_IN* in,
ADDR_FORMAT_PROPERTIES_OUT* pOut);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Surface functions
@ -2463,6 +2511,7 @@ typedef union _ADDR2_SURFACE_FLAGS
UINT_32 rotated : 1; ///< This resource is rotated and displayable
UINT_32 needEquation : 1; ///< This resource needs equation to be generated if possible
UINT_32 opt4space : 1; ///< This resource should be optimized for space
UINT_32 computeMaxSize : 1; ///< This resource should select the largest swizzle possible
UINT_32 minimizeAlign : 1; ///< This resource should use minimum alignment
UINT_32 noMetadata : 1; ///< This resource has no metadata
UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata
@ -2470,7 +2519,7 @@ typedef union _ADDR2_SURFACE_FLAGS
UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array
UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs)
UINT_32 requireMetadata : 1; ///< This resource must support metadata
UINT_32 reserved : 11; ///< Reserved bits
UINT_32 reserved : 10; ///< Reserved bits
};
UINT_32 value;
@ -2666,6 +2715,31 @@ ADDR_E_RETURNCODE ADDR_API Addr2ComputeSurfaceAddrFromCoord(
const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut);
/**
****************************************************************************************************
*   ADDR_COPY_FLAGS
*
*   @brief
*       Options controlling image copy functions. 'blockMemcpy' and 'hybridMemcpy' are mutually
*       exclusive; the copy entry points reject inputs that set both.
****************************************************************************************************
*/
typedef union _ADDR_COPY_FLAGS {
struct
{
UINT_32 blockMemcpy : 1; ///< Memory layout is pre-swizzled and stored block-by-block.
/// For regions in the miptail, this uses hybrid memcpy.
/// Regions must cover full width/height of the subresource.
UINT_32 hybridMemcpy : 1; ///< Memory layout is partially pre-swizzled and stored
/// microblock-by-microblock. Data in this format is agnostic to
/// chip harvesting and block size. Regions will be padded out
/// to microblock boundaries for alignment.
/// Mutually exclusive with 'blockMemcpy'.
UINT_32 reserved : 30; ///< Reserved bits
};
UINT_32 value; ///< Aggregate view of all flag bits
} ADDR_COPY_FLAGS;
/**
****************************************************************************************************
* ADDR2_COPY_MEMSURFACE_REGION
@ -2718,6 +2792,7 @@ typedef struct _ADDR2_COPY_MEMSURFACE_INPUT
/// - copyDims.depth == 1
/// - all copy regions target the same mip
/// - all copy regions target the same slice/depth
ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed.
} ADDR2_COPY_MEMSURFACE_INPUT;
/**
@ -4008,30 +4083,34 @@ typedef union _ADDR2_SWMODE_SET
*/
typedef struct _ADDR2_GET_PREFERRED_SURF_SETTING_INPUT
{
UINT_32 size; ///< Size of this structure in bytes
UINT_32 size; ///< Size of this structure in bytes
ADDR2_SURFACE_FLAGS flags; ///< Surface flags
AddrResourceType resourceType; ///< Surface type
AddrFormat format; ///< Surface format
AddrResrouceLocation resourceLoction; ///< Surface heap choice
ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting
///< such as linear for DXTn, tiled for YUV
ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted
BOOL_32 noXor; ///< Do not use xor mode for this resource
UINT_32 bpp; ///< bits per pixel
UINT_32 width; ///< Width (of mip0), in pixels
UINT_32 height; ///< Height (of mip0), in pixels
UINT_32 numSlices; ///< Number surface slice/depth (of mip0),
UINT_32 numMipLevels; ///< Total mipmap levels.
UINT_32 numSamples; ///< Number of samples
UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as
/// number of samples for normal AA; Set it to the
/// number of fragments for EQAA
UINT_32 maxAlign; ///< maximum base/size alignment requested by client
UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will
/// be padded to multiple of this value (in bytes)
DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible
/// size.
ADDR2_SURFACE_FLAGS flags; ///< Surface flags
AddrResourceType resourceType; ///< Surface type
AddrFormat format; ///< Surface format
AddrResrouceLocation resourceLoction; ///< Surface heap choice
ADDR2_BLOCK_SET forbiddenBlock; ///< Client can use it to disable some block setting
///< such as linear for DXTn, tiled for YUV
ADDR2_SWTYPE_SET preferredSwSet; ///< Client can use it to specify sw type(s) wanted
BOOL_32 noXor; ///< Do not use xor mode for this resource
UINT_32 bpp; ///< bits per pixel
UINT_32 width; ///< Width (of mip0), in pixels
UINT_32 height; ///< Height (of mip0), in pixels
UINT_32 numSlices; ///< Number surface slice/depth (of mip0),
UINT_32 numMipLevels; ///< Total mipmap levels.
UINT_32 numSamples; ///< Number of samples
UINT_32 numFrags; ///< Number of fragments, leave it zero or the same as
/// number of samples for normal AA; Set it to the
/// number of fragments for EQAA
UINT_32 maxAlign; ///< maximum base/size alignment requested by client
UINT_32 minSizeAlign; ///< memory allocated for surface in client driver will
/// be padded to multiple of this value (in bytes)
DOUBLE memoryBudget; ///< Memory consumption ratio based on minimum possible
/// size.
bool useBlockBasedHeuristic; ///< Use the block-based heuristic for swizzle mode selection.
/// The heuristic has the property of image size predictably
/// with image extents, which is needed for Vulkan. It ignores
/// minSizeAlign, maxAlign and memoryBudget options
} ADDR2_GET_PREFERRED_SURF_SETTING_INPUT;
/**
@ -4488,6 +4567,7 @@ typedef struct _ADDR3_COPY_MEMSURFACE_INPUT
/// - copyDims.depth == 1
/// - all copy regions target the same mip
/// - all copy regions target the same slice/depth
ADDR_COPY_FLAGS copyFlags; ///< Controls how the copy is performed.
} ADDR3_COPY_MEMSURFACE_INPUT;
/**

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/

View file

@ -20,6 +20,7 @@ files_addrlib = files(
'src/core/addrobject.h',
'src/core/addrswizzler.cpp',
'src/core/addrswizzler.h',
'src/core/addrswizzlersimd.h',
'src/core/coord.cpp',
'src/core/coord.h',
'src/gfx9/gfx9addrlib.cpp',

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -81,6 +81,34 @@ ADDR_E_RETURNCODE ADDR_API AddrDestroy(
return returnCode;
}
/**
****************************************************************************************************
*   AddrFormatProperties
*
*   @brief
*       Retrieves properties of the specified format.
*
*   @return
*       ADDR_E_RETURNCODE; ADDR_INVALIDPARAMS on a NULL handle or NULL struct pointer.
*
****************************************************************************************************
*/
ADDR_E_RETURNCODE ADDR_API AddrFormatProperties(
    ADDR_HANDLE                      hLib,  ///< [in]  handle returned by AddrCreate()
    const ADDR_FORMAT_PROPERTIES_IN* pIn,   ///< [in]  format query input
    ADDR_FORMAT_PROPERTIES_OUT*      pOut)  ///< [out] format properties
{
    ADDR_E_RETURNCODE retCode = ADDR_INVALIDPARAMS;

    // Take the input by pointer to match the extern "C" declaration in addrinterface.h
    // (a reference parameter would be a conflicting redeclaration under C linkage and is
    // not callable from C clients). Validate all pointers before dereferencing.
    if ((hLib != NULL) && (pIn != NULL) && (pOut != NULL))
    {
        Lib* pLib = Lib::GetLib(hLib);

        if (pLib != NULL)
        {
            retCode = pLib->GetFormatProperties(*pIn, pOut);
        }
    }

    return retCode;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Surface functions

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2017-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -29,6 +29,7 @@
#define FAMILY_NV3 0x91 //# 145 / Navi: 3x
#define FAMILY_STX 0x96
#define FAMILY_PHX 0x94 //# 148 / Phoenix
#define FAMILY_GFX1170 0x9A
#define FAMILY_RMB 0x92 //# 146 / Rembrandt
#define FAMILY_RPL 0x95 //# 149 / Raphael
#define FAMILY_MDN 0x97 //# 151 / Mendocino
@ -109,6 +110,7 @@
#define AMDGPU_PHOENIX2_RANGE 0x80, 0xC0 //# 128 <= x < 192
#define AMDGPU_HAWK_POINT1_RANGE 0xC0, 0xF0 //# 192 <= x < 240
#define AMDGPU_HAWK_POINT2_RANGE 0xF0, 0xFF //# 240 <= x < 255
#define AMDGPU_GFX1170_RANGE 0x01, 0x40 //# 1 <= x < 64
#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255
#define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max
@ -189,6 +191,7 @@
#define ASICREV_IS_PHOENIX2(r) ASICREV_IS(r, PHOENIX2)
#define ASICREV_IS_HAWK_POINT1(r) ASICREV_IS(r, HAWK_POINT1)
#define ASICREV_IS_HAWK_POINT2(r) ASICREV_IS(r, HAWK_POINT2)
#define ASICREV_IS_GFX1170(r) ASICREV_IS(r, GFX1170)
#define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT)
#define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL)

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -360,7 +360,7 @@ static inline UINT_32 BitMaskScanForward(
{
ADDR_ASSERT(mask > 0);
unsigned long out = 0;
#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))
#if ((defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64))) && !defined(_M_ARM64EC)
out = ::_tzcnt_u32(mask);
#elif (defined(_WIN32) || defined(_WIN64))
::_BitScanForward(&out, mask);
@ -436,6 +436,22 @@ static inline UINT_64 IsPow2(
return !(dim & (dim - 1));
}
/**
****************************************************************************************************
*   RoundUpToMultiple
*
*   @brief
*       Rounds up the specified integer to the nearest multiple of the specified alignment value.
****************************************************************************************************
*/
template <typename T>
constexpr T RoundUpToMultiple(
    T operand,   ///< Value to be aligned.
    T alignment) ///< Alignment desired.
{
    // Bias the operand by (alignment - 1) so the truncating integer division
    // lands on the next multiple at or above the original value.
    const T biased = operand + (alignment - 1);

    return (biased / alignment) * alignment;
}
/**
****************************************************************************************************
* PowTwoAlign
@ -647,6 +663,25 @@ static inline UINT_32 Log2(
return (x != 0) ? (31 ^ BitMaskScanReverse(x)) : 0;
}
/**
****************************************************************************************************
*   ConstexprLog2
*
*   @brief
*       Compile-time-capable floor(log2(x)); works whether or not x is a power of two.
*       Returns 0 when x is 0.
****************************************************************************************************
*/
static constexpr inline UINT_32 ConstexprLog2(
    UINT_32 x) ///< [in] the value should calculate log based 2
{
    UINT_32 result = 0;

    // Count how many times the value can be halved before reaching zero;
    // for x > 0 this is floor(log2(x)), and for x == 0 the loop never runs.
    for (UINT_32 v = x >> 1; v != 0; v >>= 1)
    {
        result++;
    }

    return result;
}
/**
****************************************************************************************************
* QLog2

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -222,6 +222,7 @@ ADDR_E_RETURNCODE Lib::Create(
case FAMILY_NV3:
case FAMILY_STX:
case FAMILY_PHX:
case FAMILY_GFX1170:
pLib = Gfx11HwlInit(&client);
break;
case FAMILY_NV4:
@ -304,6 +305,44 @@ ADDR_E_RETURNCODE Lib::Create(
return returnCode;
}
/**
****************************************************************************************************
*   Lib::GetFormatProperties
*
*   @brief
*       Returns the properties of the format as specified in the input.
*   @return
*       ADDR_E_RETURNCODE
****************************************************************************************************
*/
ADDR_E_RETURNCODE Lib::GetFormatProperties(
    const ADDR_FORMAT_PROPERTIES_IN& in,   ///< [in]  format to query
    ADDR_FORMAT_PROPERTIES_OUT*      pOut  ///< [out] bpp and macro-pixel extents
    ) const
{
    // When the client enabled size-field validation, both structures must carry the
    // exact sizes this library was compiled against.
    if ((GetFillSizeFieldsFlags() == TRUE) &&
        ((in.size != sizeof(ADDR_FORMAT_PROPERTIES_IN)) ||
         (pOut->size != sizeof(ADDR_FORMAT_PROPERTIES_OUT))))
    {
        return ADDR_PARAMSIZEMISMATCH;
    }

    // The element library reports bits-per-pixel and fills in the macro-pixel
    // ("expand") block dimensions for the format.
    pOut->bpp = GetElemLib()->GetBitsPerPixel(in.format,
                                              nullptr, // elemMode, unused
                                              &pOut->expand.width,
                                              &pOut->expand.height,
                                              nullptr); // unused bits

    return ADDR_OK;
}
/**
****************************************************************************************************
* Lib::SetChipFamily
@ -315,7 +354,7 @@ ADDR_E_RETURNCODE Lib::Create(
****************************************************************************************************
*/
VOID Lib::SetChipFamily(
UINT_32 uChipFamily, ///< [in] chip family defined in atiih.h
UINT_32 uChipFamily, ///< [in] chip family defined in atiid.h
UINT_32 uChipRevision) ///< [in] chip revision defined in "asic_family"_id.h
{
ChipFamily family = HwlConvertChipFamily(uChipFamily, uChipRevision);
@ -668,6 +707,47 @@ UINT_32 Lib::GetBpe(AddrFormat format) const
return GetElemLib()->GetBitsPerPixel(format);
}
/**
****************************************************************************************************
* Lib::GetSwizzleModePreferenceRatio
*
* @brief
* Get ratio driving swizzle mode selection heuristic. Ratio is returned as fraction nominator
* and denominator
* @return
* void
****************************************************************************************************
*/
void Lib::GetSwizzleModePreferenceRatio(
const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,
UINT_32* pOutRatioLo,
UINT_32* pOutRatioHi
) const
{
const BOOL_32 computeMinSize = (pIn->flags.minimizeAlign == 1) || (pIn->memoryBudget >= 1.0);
if (computeMinSize)
{
*pOutRatioLo = 1;
*pOutRatioHi = 1;
}
else if (pIn->flags.opt4space)
{
*pOutRatioLo = 3;
*pOutRatioHi = 2;
}
else if (pIn->flags.computeMaxSize)
{
*pOutRatioLo = 1024;
*pOutRatioHi = 1;
}
else
{
*pOutRatioLo = 2;
*pOutRatioHi = 1;
}
}
/**
************************************************************************************************************************
* Lib::ComputeOffsetFromSwizzlePattern

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -300,6 +300,10 @@ public:
delete this;
}
ADDR_E_RETURNCODE GetFormatProperties(
const ADDR_FORMAT_PROPERTIES_IN& in,
ADDR_FORMAT_PROPERTIES_OUT* pOut) const;
static Lib* GetLib(ADDR_HANDLE hLib);
/// Returns which version of addrlib functions should be used.
@ -333,6 +337,10 @@ public:
UINT_32 GetBpe(AddrFormat format) const;
void GetSwizzleModePreferenceRatio(
const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn,
UINT_32* pOutRatioLo,
UINT_32* pOutRatioHi) const;
static UINT_32 ComputeOffsetFromSwizzlePattern(
const UINT_64* pPattern,

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -2473,7 +2473,7 @@ UINT_64 Lib::HwlComputeXmaskAddrFromCoord(
//
macroTileIndexX = x / macroTileWidth;
macroTileIndexY = y / macroTileHeight;
macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
macroTileOffset = (static_cast<UINT_64>(macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
//
// Compute the pixel offset within the macro tile.
@ -2675,7 +2675,7 @@ VOID Lib::ComputeSurfaceCoordFromAddrMicroTiled(
//
sliceBits = static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples;
rowBits = (pitch / MicroTileWidth) * microTileBits;
rowBits = static_cast<UINT_64>(pitch / MicroTileWidth) * microTileBits;
//
// Extract the slice index.
@ -3559,11 +3559,11 @@ BOOL_32 Lib::DegradeTo1D(
if (degrade == FALSE)
{
// Only check width and height as slices are aligned to thickness
UINT_64 unalignedSize = width * height;
UINT_64 unalignedSize = static_cast<UINT_64>(width) * height;
UINT_32 alignedPitch = PowTwoAlign(width, macroTilePitchAlign);
UINT_32 alignedHeight = PowTwoAlign(height, macroTileHeightAlign);
UINT_64 alignedSize = alignedPitch * alignedHeight;
UINT_64 alignedSize = static_cast<UINT_64>(alignedPitch) * alignedHeight;
// alignedSize > 1.5 * unalignedSize
if (2 * alignedSize > 3 * unalignedSize)

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -207,7 +207,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo(
// Overwrite these parameters if we have a valid format
}
if (localIn.bpp != 0)
if (localIn.bpp >= 8)
{
localIn.width = Max(localIn.width, 1u);
localIn.height = Max(localIn.height, 1u);
@ -444,8 +444,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface(
void* pMipBase = VoidPtrInc(pIn->pMappedSurface,
(pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset));
const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch;
const size_t lineSizeBytes = (static_cast<size_t>(localIn.bpp) >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (static_cast<size_t>(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch;
for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++)
{
@ -504,6 +504,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -573,6 +578,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -1424,7 +1434,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear(
{
pOut->addr = (localOut.sliceSize * pIn->slice) +
mipInfo[pIn->mipId].offset +
(pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3);
(static_cast<size_t>(pIn->y) * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3);
pOut->bitPosition = 0;
}
else

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -39,8 +39,6 @@ namespace V3
Lib::Lib()
:
Addr::Lib(),
m_pipesLog2(0),
m_pipeInterleaveLog2(0),
m_numEquations(0)
{
Init();
@ -59,8 +57,6 @@ Lib::Lib(
const Client* pClient)
:
Addr::Lib(pClient),
m_pipesLog2(0),
m_pipeInterleaveLog2(0),
m_numEquations(0)
{
Init();
@ -265,7 +261,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo(
// Overwrite these parameters if we have a valid format
}
if (localIn.bpp != 0)
if (localIn.bpp >= 8)
{
localIn.width = Max(localIn.width, 1u);
localIn.height = Max(localIn.height, 1u);
@ -547,8 +543,8 @@ ADDR_E_RETURNCODE Lib::CopyLinearSurface(
void* pMipBase = VoidPtrInc(pIn->pMappedSurface,
(pIn->singleSubres ? 0 : mipInfo[pCurRegion->mipId].offset));
const size_t lineSizeBytes = (localIn.bpp >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (localIn.bpp >> 3) * mipInfo[pCurRegion->mipId].pitch;
const size_t lineSizeBytes = (static_cast<size_t>(localIn.bpp) >> 3) * pCurRegion->copyDims.width;
const size_t lineImgPitchBytes = (static_cast<size_t>(localIn.bpp) >> 3) * mipInfo[pCurRegion->mipId].pitch;
for (UINT_32 sliceIdx = 0; sliceIdx < pCurRegion->copyDims.depth; sliceIdx++)
{
@ -611,6 +607,11 @@ ADDR_E_RETURNCODE Lib::CopyMemToSurface(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -680,6 +681,11 @@ ADDR_E_RETURNCODE Lib::CopySurfaceToMem(
{
returnCode = ADDR_INVALIDPARAMS;
}
else if (pIn->copyFlags.blockMemcpy && pIn->copyFlags.hybridMemcpy)
{
// Invalid to specify conflicting copy modes.
returnCode = ADDR_INVALIDPARAMS;
}
else
{
UINT_32 baseSlice = pRegions[0].slice;
@ -736,7 +742,7 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn,
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut)
{
ADDR_E_RETURNCODE returnCode;
ADDR_E_RETURNCODE returnCode = ADDR_OK;
if ((GetFillSizeFieldsFlags() == TRUE) &&
((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) ||
@ -746,7 +752,23 @@ ADDR_E_RETURNCODE Lib::ComputePipeBankXor(
}
else
{
returnCode = HwlComputePipeBankXor(pIn, pOut);
// The swizzle mode determines how many unused bits there are in the address. We never (ok, rarely...) program
// the low eight bits of the address, so the "numSwizzleBits" effectively represents the number of "guaranteed
// zero" programmed bits in the address.
const UINT_32 numSwizzleBits = GetBlockSizeLog2(pIn->swizzleMode, FALSE) - 8;
// make sure this configuration supports swizzling
if (numSwizzleBits != 0)
{
// These cases should have been excluded with the "numSwizzleBits" calculation above, but make sure here.
ADDR_ASSERT((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE));
pOut->pipeBankXor = pIn->surfIndex % (1 << numSwizzleBits);
}
else
{
pOut->pipeBankXor = 0;
}
}
return returnCode;
@ -1167,7 +1189,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfoSanityCheck(
return HwlValidateNonSwModeParams(&localIn) ? ADDR_OK : ADDR_INVALIDPARAMS;
}
/**
************************************************************************************************************************
* Lib::ComputeOffsetFromEquation

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -44,7 +44,6 @@ struct ADDR3_COORD
struct ADDR3_COMPUTE_SURFACE_INFO_PARAMS_INPUT
{
const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pSurfInfo;
void* pvAddrParams;
};
/**
@ -155,14 +154,16 @@ protected:
Lib(); // Constructor is protected
Lib(const Client* pClient);
UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2
UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes
SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table
// Number of unique MSAA sample rates (1/2/4/8)
static const UINT_32 MaxNumMsaaRates = 4;
//# These fields exist in the GB_ADDR_CONFIG register; however, the HW does not care about them.
//# The HW acts as if the log2(pipes)==5 and log2(pi) == 8, always.
static const UINT_32 NumPipesLog2 = 5;
static const UINT_32 PipeInterleaveLog2 = 8;
// Number of equation entries in the table
UINT_32 m_numEquations;
@ -444,4 +445,4 @@ private:
} // V3
} // Addr
#endif
#endif

View file

@ -2,7 +2,8 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -14,6 +15,7 @@
*/
#include "addrswizzler.h"
#include "addrswizzlersimd.h"
namespace Addr
{
@ -98,6 +100,23 @@ void LutAddresser::InitSwizzleProps()
m_sLutMask |= m_bit[i].s;
}
// Derive the microblock size from the swizzle equation.
UINT_32 xMbMask = 0;
UINT_32 yMbMask = 0;
UINT_32 zMbMask = 0;
for (UINT_32 i = 0; i < 8; i++)
{
xMbMask |= m_bit[i].x;
yMbMask |= m_bit[i].y;
zMbMask |= m_bit[i].z;
}
m_microBlockSize.width = xMbMask + 1;
m_microBlockSize.height = yMbMask + 1;
m_microBlockSize.depth = zMbMask + 1;
ADDR_ASSERT(IsPow2(m_microBlockSize.width));
ADDR_ASSERT(IsPow2(m_microBlockSize.height));
ADDR_ASSERT(IsPow2(m_microBlockSize.depth));
// An expandX of 1 is a no-op
m_maxExpandX = 1;
if (m_sLutMask == 0)
@ -153,7 +172,7 @@ void LutAddresser::InitLuts()
m_pYLut = &m_lutData[0];
ADDR_ASSERT(m_pYLut[0] == 0);
}
if (m_zLutMask != 0)
{
m_pZLut = &m_lutData[curOffset];
@ -269,82 +288,33 @@ UINT_32 LutAddresser::EvalEquation(
/**
****************************************************************************************************
* Copy2DSliceUnaligned
* CopyRowUnaligned
*
* @brief
* Copies an arbitrary 2D pixel region to or from a surface.
* Copies a single row to or from a surface.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX, bool ImgIsDest>
void Copy2DSliceUnaligned(
void* pImgBlockSliceStart, // Block corresponding to beginning of slice
void* pBuf, // Pointer to data starting from the copy origin.
size_t bufStrideY, // Stride of each row in pBuf
UINT_32 imageBlocksY, // Width of the image slice, in blocks.
ADDR_COORD2D origin, // Absolute origin, in elements
ADDR_EXTENT2D extent, // Size to copy, in elements
UINT_32 sliceXor, // Includes pipeBankXor and z XOR
void CopyRowUnaligned(
void* pRowImgBlockStart, // Pointer to the image block at x=0
void* pBuf, // Pointer to data at x=0
UINT_32 xStart, // x value to start at
UINT_32 xEnd, // x value to finish at (not inclusive)
UINT_32 rowXor, // Value to XOR in for each address (makes up PBX and y/z coords)
const LutAddresser& addresser)
{
UINT_32 xStart = origin.x;
UINT_32 xEnd = origin.x + extent.width;
UINT_32 x = xStart;
constexpr UINT_32 PixBytes = (1 << BPELog2);
// Apply a negative offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
pBuf = VoidPtrDec(pBuf, xStart * PixBytes);
// Do things one row at a time for unaligned regions.
for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
// Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
// regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though.
if (ExpandX > 1)
{
UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY;
UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
UINT_32 x = xStart;
// Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
// regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though.
if (ExpandX > 1)
// Unaligned left edge
for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
{
// Unaligned left edge
for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
{
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
// Aligned middle
for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
{
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
}
}
}
// Unaligned end (or the whole thing when ExpandX == 1)
for (; x < xEnd; x++)
{
// Get the index of the block within the slice
UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
// Apply that index to get the base address of the current block.
void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
// Grab the x-xor and XOR it all together, adding to get the final address
UINT_32 blk = (x >> addresser.GetBlockXBits());
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
@ -355,8 +325,478 @@ void Copy2DSliceUnaligned(
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
// Aligned middle
for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
{
UINT_32 blk = (x >> addresser.GetBlockXBits());
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
}
}
}
// Unaligned end (or the whole thing when ExpandX == 1)
for (; x < xEnd; x++)
{
// Get the index of the block within the slice
UINT_32 blk = (x >> addresser.GetBlockXBits());
// Apply that index to get the base address of the current block.
void* pImgBlock = VoidPtrInc(pRowImgBlockStart, blk << addresser.GetBlockBits());
// Grab the x-xor and XOR it all together, adding to get the final address
void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
if (ImgIsDest)
{
memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
}
else
{
memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
}
}
}
pBuf = VoidPtrInc(pBuf, bufStrideY);
/**
****************************************************************************************************
* CopyImgUnaligned
*
* @brief
*   Copies an arbitrary 3D pixel region to or from a surface, one row at a time.
*   This is the generic slow path: each row is handed to CopyRowUnaligned, which resolves the
*   swizzled address of every pixel (or small ExpandX group) individually.
*
*   Fix: the per-row byte offset is now widened to 64 bits *before* shifting by the block size.
*   The previous `(zBlk + yBlk) << GetBlockBits()` was evaluated in 32-bit arithmetic, which
*   overflows once a block's byte offset within the image reaches 4GiB.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX, bool ImgIsDest>
void CopyImgUnaligned(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail. Unused on this path.
    const LutAddresser& addresser)
{
    constexpr UINT_32 PixBytes = (1 << BPELog2);
    // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
    // Keep the z offset.
    pBuf = VoidPtrDec(pBuf, origin.x * PixBytes);
    void* pSliceBuf = pBuf;
    // Do things one slice/row at a time for unaligned regions.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z++)
    {
        // Per-slice contributions: the swizzle's z component and the slice's block-row index.
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        void*   pRowBuf  = pSliceBuf;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
        {
            UINT_32 yBlk   = (y >> addresser.GetBlockYBits()) * imageBlocksY;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            // Widen the block index to 64 bits before shifting: a 32-bit shift overflows once
            // the byte offset of this block row exceeds 4GiB.
            UINT_64 rowOffset    = static_cast<UINT_64>(zBlk + yBlk) << addresser.GetBlockBits();
            void*   pImgBlockRow = VoidPtrInc(pImgBlockStart, rowOffset);
            CopyRowUnaligned<BPELog2, ExpandX, ImgIsDest>(
                pImgBlockRow,
                pRowBuf,
                origin.x,
                origin.x + extent.width,
                rowXor,
                addresser);
            pRowBuf = VoidPtrInc(pRowBuf, bufStrideY);
        }
        pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ);
    }
}
/**
****************************************************************************************************
* HandleUnalignedRegions
*
* @brief
*    Does unaligned copies for any X/Y/Z edges that are not fully aligned, fixing up the
*    copy region and pointer to point at the aligned region that remains.
*
*    Up to six slow CopyImgUnaligned calls are made, one per unaligned face of the region:
*    first the low x/y/z edges (which also advance *pOrigin and *ppBuf), then the high x/y/z
*    edges (which only trim *pExtent). On return, *pOrigin is aligned to 'align' and *pExtent
*    is a multiple of it, so the caller can copy the interior with an aligned fast path.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX>
void HandleUnalignedRegions(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void**              ppBuf,          // Pointer to pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D*       pOrigin,        // Absolute origin, in elements. Updated on return.
    ADDR_EXTENT3D*      pExtent,        // Size to copy, in elements. Updated on return.
    ADDR_EXTENT3D       align,          // Size to align on, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // This helper is only instantiated for the memory->image direction.
    // NOTE(review): hardcoded — confirm no image->memory caller routes through here.
    constexpr bool ImgIsDest = true;
    // Go through the start/end of the x/y/z extents and copy the parts that aren't aligned.
    // Unaligned low-x edge: copy the strip from origin.x up to the next aligned x, spanning
    // the full y/z range.
    if (pOrigin->x != PowTwoAlign(pOrigin->x, align.width))
    {
        // Clamp to the region's end in case the whole copy is narrower than one align unit.
        UINT_32 xSize = Min(pOrigin->x + pExtent->width, PowTwoAlign(pOrigin->x, align.width)) - pOrigin->x;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { xSize, pExtent->height, pExtent->depth},
            pipeBankXor,
            isInMipTail,
            addresser);
        // Shrink the region and advance the buffer pointer past the strip just copied.
        pExtent->width -= xSize;
        pOrigin->x += xSize;
        *ppBuf = VoidPtrInc(*ppBuf, xSize << BPELog2);
    }
    // Unaligned low-y edge, same idea: a slab of ySize rows across full x/z.
    if (pOrigin->y != PowTwoAlign(pOrigin->y, align.height))
    {
        UINT_32 ySize = Min(pOrigin->y + pExtent->height, PowTwoAlign(pOrigin->y, align.height)) - pOrigin->y;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { pExtent->width, ySize, pExtent->depth},
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->height -= ySize;
        pOrigin->y += ySize;
        *ppBuf = VoidPtrInc(*ppBuf, ySize * bufStrideY);
    }
    // Unaligned low-z edge: zSize leading slices across full x/y.
    if (pOrigin->z != PowTwoAlign(pOrigin->z, align.depth))
    {
        UINT_32 zSize = Min(pOrigin->z + pExtent->depth, PowTwoAlign(pOrigin->z, align.depth)) - pOrigin->z;
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            *ppBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            *pOrigin,
            { pExtent->width, pExtent->height, zSize },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->depth -= zSize;
        pOrigin->z += zSize;
        *ppBuf = VoidPtrInc(*ppBuf, zSize * bufStrideZ);
    }
    // At this point the starts are aligned, so we can care about just size rather than origin+size.
    // Unaligned high-x edge: copy the trailing strip and trim the width down to a multiple of align.
    if ((pExtent->width) != PowTwoAlignDown(pExtent->width, align.width))
    {
        UINT_32 xAlignedSize = PowTwoAlignDown(pOrigin->x + pExtent->width, align.width) - pOrigin->x;
        // Local pointer only; *ppBuf still points at the (aligned) region start.
        void* pBuf = VoidPtrInc(*ppBuf, xAlignedSize << BPELog2);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x + xAlignedSize, pOrigin->y, pOrigin->z},
            { pExtent->width - xAlignedSize, pExtent->height, pExtent->depth },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->width = xAlignedSize;
    }
    // Unaligned high-y edge.
    if ((pExtent->height) != PowTwoAlignDown(pExtent->height, align.height))
    {
        UINT_32 yAlignedSize = PowTwoAlignDown(pOrigin->y + pExtent->height, align.height) - pOrigin->y;
        void* pBuf = VoidPtrInc(*ppBuf, yAlignedSize * bufStrideY);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x, pOrigin->y + yAlignedSize, pOrigin->z},
            { pExtent->width, pExtent->height - yAlignedSize, pExtent->depth },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->height = yAlignedSize;
    }
    // Unaligned high-z edge.
    if ((pExtent->depth) != PowTwoAlignDown(pExtent->depth, align.depth))
    {
        UINT_32 zAlignedSize = PowTwoAlignDown(pOrigin->z + pExtent->depth, align.depth) - pOrigin->z;
        void* pBuf = VoidPtrInc(*ppBuf, zAlignedSize * bufStrideZ);
        CopyImgUnaligned<BPELog2, ExpandX, ImgIsDest>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            { pOrigin->x, pOrigin->y, pOrigin->z + zAlignedSize },
            { pExtent->width, pExtent->height, pExtent->depth - zAlignedSize },
            pipeBankXor,
            isInMipTail,
            addresser);
        pExtent->depth = zAlignedSize;
    }
}
/**
****************************************************************************************************
* CopyMemImgHybrid
*
* @brief
*   Copies a 3D pixel region to a surface. Uses fast copies (MicroSw::CopyMicroBlock) for fully
*   covered microblocks; unaligned x/y/z edges are peeled off first via HandleUnalignedRegions.
*
*   Fix: the microblock byte offset is now widened to 64 bits *before* shifting by the block
*   size; the previous 32-bit shift overflows once a block's byte offset reaches 4GiB. The
*   linear-buffer pixel offset is likewise computed in size_t for >4GiB buffers.
****************************************************************************************************
*/
template <class MicroSw>
AVX2_FUNC NEON_FUNC void CopyMemImgHybrid(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf
    size_t              bufStrideZ,     // Stride of each slice in pBuf
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Handle unaligned edges in x/y/z and fixup the extents to match. After this call,
    // origin/extent describe a (possibly empty) region aligned to MicroSw::MicroBlockExtent.
    HandleUnalignedRegions<MicroSw::BpeLog2, MicroSw::ExpandX>(
        pImgBlockStart,
        &pBuf,
        bufStrideY,
        bufStrideZ,
        imageBlocksY,
        imageBlocksZ,
        &origin,
        &extent,
        MicroSw::MicroBlockExtent,
        pipeBankXor,
        isInMipTail,
        addresser
    );
    // Apply a negative x/y offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
    // Keep the z offset.
    pBuf = VoidPtrDec(pBuf, origin.x << MicroSw::BpeLog2);
    void* pSliceBuf = pBuf;
    // Walk the aligned interior one microblock at a time.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += MicroSw::MicroBlockExtent.depth)
    {
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        void*   pRowBuf  = pSliceBuf;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += MicroSw::MicroBlockExtent.height)
        {
            UINT_32 yBlk   = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += MicroSw::MicroBlockExtent.width)
            {
                UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk;
                // Widen the block index to 64 bits before shifting; a 32-bit shift overflows
                // once the block's byte offset exceeds 4GiB.
                UINT_64 offset = static_cast<UINT_64>(xBlk) << addresser.GetBlockBits();
                // The low bits (within 256B) come from the swizzle LUTs, XORed together.
                offset ^= rowXor;
                offset ^= addresser.GetAddressX(x);
                void* pPix    = VoidPtrInc(pImgBlockStart, offset);
                // Compute the linear-buffer offset in size_t so >4GiB buffers work.
                void* pPixBuf = VoidPtrInc(pRowBuf, static_cast<size_t>(x) << MicroSw::BpeLog2);
                MicroSw::CopyMicroBlock(
                    pPix,
                    pPixBuf,
                    bufStrideY,
                    bufStrideZ
                );
            }
            pRowBuf = VoidPtrInc(pRowBuf, bufStrideY * MicroSw::MicroBlockExtent.height);
        }
        pSliceBuf = VoidPtrInc(pSliceBuf, bufStrideZ * MicroSw::MicroBlockExtent.depth);
    }
}
/**
****************************************************************************************************
* CopyMemImgMicroblocks
*
* @brief
*   Copies the microblocks of a 3D pixel region to/from a surface. The region is first padded
*   outwards to microblock boundaries, and pBuf is consumed as a dense stream of 256-byte
*   microblocks (bufStrideY / bufStrideZ are ignored).
*
*   Fix: the microblock byte offset is now widened to 64 bits *before* shifting by the block
*   size; the previous 32-bit shift overflows once a block's byte offset reaches 4GiB.
****************************************************************************************************
*/
template <bool ImgIsDest, bool NonTemporal>
AVX2_FUNC NEON_FUNC void CopyMemImgMicroblocks(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf, ignored.
    size_t              bufStrideZ,     // Stride of each slice in pBuf, ignored.
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Pad out our dims to microblock boundaries.
    origin.x      = PowTwoAlignDown(origin.x, addresser.GetMicroBlockX());
    origin.y      = PowTwoAlignDown(origin.y, addresser.GetMicroBlockY());
    origin.z      = PowTwoAlignDown(origin.z, addresser.GetMicroBlockZ());
    extent.width  = PowTwoAlign(extent.width, addresser.GetMicroBlockX());
    extent.height = PowTwoAlign(extent.height, addresser.GetMicroBlockY());
    extent.depth  = PowTwoAlign(extent.depth, addresser.GetMicroBlockZ());
    // Calculate the address of the first pixel in each microblock (256B), then copy it.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetMicroBlockZ())
    {
        UINT_32 sliceXor = pipeBankXor ^ addresser.GetAddressZ(z);
        UINT_32 zBlk     = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetMicroBlockY())
        {
            UINT_32 yBlk   = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
            for (UINT_32 x = origin.x; x < (origin.x + extent.width); x += addresser.GetMicroBlockX())
            {
                UINT_32 xBlk = (x >> addresser.GetBlockXBits()) + yBlk;
                // Widen the block index to 64 bits before shifting; a 32-bit shift overflows
                // once the block's byte offset exceeds 4GiB.
                UINT_64 offset = static_cast<UINT_64>(xBlk) << addresser.GetBlockBits();
                offset ^= rowXor;
                offset ^= addresser.GetAddressX(x);
                void* pPix = VoidPtrInc(pImgBlockStart, offset);
                // A microblock is always 256 bytes.
                constexpr UINT_32 CopySize = 1 << 8;
#if ADDR_HAS_AVX2
                // Non-temporal streaming copies bypass the cache; fences are handled by the
                // DoCopy*Flushes helpers around the whole copy.
                if (NonTemporal && ImgIsDest)
                {
                    StreamCopyToImgAligned(pPix, pBuf, CopySize);
                }
                else if (NonTemporal)
                {
                    StreamCopyFromImgAligned(pBuf, pPix, CopySize);
                }
                else
#endif
                if (ImgIsDest)
                {
                    memcpy(pPix, pBuf, CopySize);
                }
                else
                {
                    memcpy(pBuf, pPix, CopySize);
                }
                pBuf = VoidPtrInc(pBuf, CopySize);
            }
        }
    }
}
/**
****************************************************************************************************
* CopyMemImgBlocks
*
* @brief
*   Copies the blocks of a 3D pixel region to/from a surface. The region is padded outwards to
*   whole-block boundaries and pBuf is consumed as a dense stream of blocks in (strided)
*   typewriter order (bufStrideY / bufStrideZ are ignored). Mip-tail regions are delegated to
*   CopyMemImgMicroblocks instead.
*
*   Fix: the block byte offset is now widened to 64 bits *before* shifting by the block size;
*   the previous 32-bit shift overflows once a block's byte offset reaches 4GiB.
****************************************************************************************************
*/
template <bool ImgIsDest, bool NonTemporal>
AVX2_FUNC NEON_FUNC void CopyMemImgBlocks(
    void*               pImgBlockStart, // Block corresponding to beginning of image
    void*               pBuf,           // Pointer to data starting from the copy origin.
    size_t              bufStrideY,     // Stride of each row in pBuf, ignored.
    size_t              bufStrideZ,     // Stride of each slice in pBuf, ignored.
    UINT_32             imageBlocksY,   // Width of the image slice, in blocks.
    UINT_32             imageBlocksZ,   // Depth pitch of the image slice, in blocks.
    ADDR_COORD3D        origin,         // Absolute origin, in elements
    ADDR_EXTENT3D       extent,         // Size to copy, in elements
    UINT_32             pipeBankXor,    // Final value to xor in
    BOOL_32             isInMipTail,    // True if this is in the mip tail.
    const LutAddresser& addresser)
{
    // Mip-tail layout is not plain block order; use the microblock-granular path instead.
    if (isInMipTail)
    {
        return CopyMemImgMicroblocks<ImgIsDest, NonTemporal>(
            pImgBlockStart,
            pBuf,
            bufStrideY,
            bufStrideZ,
            imageBlocksY,
            imageBlocksZ,
            origin,
            extent,
            pipeBankXor,
            isInMipTail,
            addresser
        );
    }
    // Pad out our dims to block boundaries.
    origin.x      = PowTwoAlignDown(origin.x, addresser.GetBlockX());
    origin.y      = PowTwoAlignDown(origin.y, addresser.GetBlockY());
    origin.z      = PowTwoAlignDown(origin.z, addresser.GetBlockZ());
    extent.width  = PowTwoAlign(extent.width, addresser.GetBlockX());
    extent.height = PowTwoAlign(extent.height, addresser.GetBlockY());
    extent.depth  = PowTwoAlign(extent.depth, addresser.GetBlockZ());
    // Copy block by block. No complex swizzling here, everything is in (strided) typewriter order.
    // A whole row of x-adjacent blocks is contiguous in the image, so copy it in one call.
    for (UINT_32 z = origin.z; z < (origin.z + extent.depth); z += addresser.GetBlockZ())
    {
        UINT_32 zBlk = (z >> addresser.GetBlockZBits()) * imageBlocksZ;
        for (UINT_32 y = origin.y; y < (origin.y + extent.height); y += addresser.GetBlockY())
        {
            UINT_32 yBlk      = ((y >> addresser.GetBlockYBits()) * imageBlocksY) + zBlk;
            UINT_32 xBlkStart = (origin.x >> addresser.GetBlockXBits()) + yBlk;
            UINT_32 numXBlk   = extent.width >> addresser.GetBlockXBits();
            // Widen the block index to 64 bits before shifting; a 32-bit shift overflows once
            // the block's byte offset exceeds 4GiB.
            UINT_64 offset   = static_cast<UINT_64>(xBlkStart) << addresser.GetBlockBits();
            void*   pPix     = VoidPtrInc(pImgBlockStart, offset);
            UINT_32 copySize = numXBlk << addresser.GetBlockBits();
#if ADDR_HAS_AVX2
            if (NonTemporal && ImgIsDest)
            {
                StreamCopyToImgAligned(pPix, pBuf, copySize);
            }
            else if (NonTemporal)
            {
                StreamCopyFromImgAligned(pBuf, pPix, copySize);
            }
            else
#endif
            if (ImgIsDest)
            {
                memcpy(pPix, pBuf, copySize);
            }
            else
            {
                memcpy(pBuf, pPix, copySize);
            }
            pBuf = VoidPtrInc(pBuf, copySize);
        }
    }
}
@ -368,33 +808,130 @@ void Copy2DSliceUnaligned(
* Determines and returns which copy function to use for copying to images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc(
ADDR_COPY_FLAGS flags
) const
{
UnalignedCopyMemImgFunc pfnRet = nullptr;
// This key encodes how the bottom 8 bits (256B) are formed, so we can match to the correct optimized
// swizzle function (they are all swizzle-agnostic beyond those 256B).
UINT_64 microSwKey = GetMicroSwKey(reinterpret_cast<const UINT_64*>(&m_bit[0]));
if (flags.blockMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgBlocks<true, true>;
}
else
#endif
{
pfnRet = CopyMemImgBlocks<true, false>;
}
}
if ((pfnRet == nullptr) && flags.hybridMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgMicroblocks<true, true>;
}
else
#endif
{
pfnRet = CopyMemImgMicroblocks<true, false>;
}
}
// If this is one of the known microswizzles and CPU support is present, use a hybrid copy that does
// SIMD swizzling for aligned regions and falls back for unaligned edges.
#if ADDR_HAS_AVX2
static constexpr struct {
UINT_64 microSwKey;
UnalignedCopyMemImgFunc pfn;
} AvxFuncs[] = {
{ GetMicroSwKey(MicroSw_2D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_8BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_8BPE_AVX2>},
{ GetMicroSwKey(MicroSw_2D_16BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_2D_16BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_8BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_8BPE_AVX2>},
{ GetMicroSwKey(MicroSw_3D_16BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_3D_16BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_2BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_2BPE_AVX2>},
{ GetMicroSwKey(MicroSw_R_4BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_R_4BPE_AVX2>},
{ GetMicroSwKey(MicroSw_Z_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_Z_1BPE_AVX2>},
{ GetMicroSwKey(MicroSw_D_1BPE_AVX2::MicroEq), CopyMemImgHybrid<MicroSw_D_1BPE_AVX2>}
};
if ((pfnRet == nullptr) && CpuSupportsAvx2())
{
for (const auto& func : AvxFuncs)
{
if (func.microSwKey == microSwKey)
{
pfnRet = func.pfn;
break;
}
}
}
#endif // ADDR_HAS_AVX2
#if ADDR_HAS_NEON
static constexpr struct {
UINT_64 microSwKey;
UnalignedCopyMemImgFunc pfn;
} NeonFuncs[] = {
{ GetMicroSwKey(MicroSw_2D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_8BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_8BPE_NEON>},
{ GetMicroSwKey(MicroSw_2D_16BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_2D_16BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_8BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_8BPE_NEON>},
{ GetMicroSwKey(MicroSw_3D_16BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_3D_16BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_2BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_2BPE_NEON>},
{ GetMicroSwKey(MicroSw_R_4BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_R_4BPE_NEON>},
{ GetMicroSwKey(MicroSw_Z_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_Z_1BPE_NEON>},
{ GetMicroSwKey(MicroSw_D_1BPE_NEON::MicroEq), CopyMemImgHybrid<MicroSw_D_1BPE_NEON>}
};
if ((pfnRet == nullptr) && CpuSupportsNeon())
{
for (const auto& func : NeonFuncs)
{
if (func.microSwKey == microSwKey)
{
pfnRet = func.pfn;
break;
}
}
}
#endif // ADDR_HAS_NEON
// While these are all the same function, the codegen gets really bad if the size of each pixel
// is not known at compile time. Hence, templates.
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
{
// ExpandX = 1, 2, 4
{ Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE
{ Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE
{ Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE
{ Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE
{ Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE
{ CopyImgUnaligned<0, 1, true>, CopyImgUnaligned<0, 2, true>, CopyImgUnaligned<0, 4, true> }, // 1BPE
{ CopyImgUnaligned<1, 1, true>, CopyImgUnaligned<1, 2, true>, CopyImgUnaligned<1, 4, true> }, // 2BPE
{ CopyImgUnaligned<2, 1, true>, CopyImgUnaligned<2, 2, true>, CopyImgUnaligned<2, 4, true> }, // 4BPE
{ CopyImgUnaligned<3, 1, true>, CopyImgUnaligned<3, 2, true>, CopyImgUnaligned<3, 4, true> }, // 8BPE
{ CopyImgUnaligned<4, 1, true>, CopyImgUnaligned<4, 2, true>, CopyImgUnaligned<4, 4, true> }, // 16BPE
};
UnalignedCopyMemImgFunc pfnRet = nullptr;
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
if (m_maxExpandX >= 4)
// Fallback functions
if (pfnRet == nullptr)
{
pfnRet = Funcs[m_bpeLog2][2];
}
else if (m_maxExpandX >= 2)
{
pfnRet = Funcs[m_bpeLog2][1];
}
else
{
pfnRet = Funcs[m_bpeLog2][0];
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))];
}
return pfnRet;
}
@ -407,35 +944,139 @@ UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
* Determines and returns which copy function to use for copying from images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const
UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc(
ADDR_COPY_FLAGS flags
) const
{
UnalignedCopyMemImgFunc pfnRet = nullptr;
if (flags.blockMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgBlocks<false, true>;
}
else
#endif
{
pfnRet = CopyMemImgBlocks<false, false>;
}
}
if ((pfnRet == nullptr) && flags.hybridMemcpy)
{
#if ADDR_HAS_AVX2
if (CpuSupportsAvx2())
{
pfnRet = CopyMemImgMicroblocks<false, true>;
}
else
#endif
{
pfnRet = CopyMemImgMicroblocks<false, false>;
}
}
// While these are all the same function, the codegen gets really bad if the size of each pixel
// is not known at compile time. Hence, templates.
const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
{
// ExpandX = 1, 2, 4
{ Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE
{ Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE
{ Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE
{ Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE
{ Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE
{ CopyImgUnaligned<0, 1, false>, CopyImgUnaligned<0, 2, false>, CopyImgUnaligned<0, 4, false> }, // 1BPE
{ CopyImgUnaligned<1, 1, false>, CopyImgUnaligned<1, 2, false>, CopyImgUnaligned<1, 4, false> }, // 2BPE
{ CopyImgUnaligned<2, 1, false>, CopyImgUnaligned<2, 2, false>, CopyImgUnaligned<2, 4, false> }, // 4BPE
{ CopyImgUnaligned<3, 1, false>, CopyImgUnaligned<3, 2, false>, CopyImgUnaligned<3, 4, false> }, // 8BPE
{ CopyImgUnaligned<4, 1, false>, CopyImgUnaligned<4, 2, false>, CopyImgUnaligned<4, 4, false> }, // 16BPE
};
UnalignedCopyMemImgFunc pfnRet = nullptr;
ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
if (m_maxExpandX >= 4)
if (pfnRet == nullptr)
{
pfnRet = Funcs[m_bpeLog2][2];
}
else if (m_maxExpandX >= 2)
{
pfnRet = Funcs[m_bpeLog2][1];
}
else
{
pfnRet = Funcs[m_bpeLog2][0];
pfnRet = Funcs[m_bpeLog2][Min(2U, Log2(m_maxExpandX))];
}
return pfnRet;
}
/**
****************************************************************************************************
* LutAddresser::DoCopyImgMemPreFlushes
*
* @brief
*    Does any flushes required for nontemporal SIMD instructions to access the image memory.
*    Intended to be called once before an image->memory copy; a no-op when AVX2 is unavailable
*    or no non-temporal copy path is requested.
****************************************************************************************************
*/
void LutAddresser::DoCopyImgMemPreFlushes(
    ADDR_COPY_FLAGS flags
    ) const
{
#if ADDR_HAS_AVX2
    // Only the block/hybrid memcpy paths use non-temporal loads, and only when AVX2 is
    // actually available at runtime.
    if ((flags.blockMemcpy || flags.hybridMemcpy) && CpuSupportsAvx2())
    {
        // Loads are weakly ordered, and we need to ensure they start after the previous copy
        NonTemporalLoadStoreFence();
    }
#endif
}
/**
****************************************************************************************************
* LutAddresser::DoCopyMemImgPostFlushes
*
* @brief
*    Does any flushes required for nontemporal SIMD instructions to access the image memory.
*    Intended to be called once after a memory->image copy.
****************************************************************************************************
*/
void LutAddresser::DoCopyMemImgPostFlushes(
    ADDR_COPY_FLAGS flags
    ) const
{
#if ADDR_HAS_AVX2
    // NOTE(review): unlike DoCopyImgMemPreFlushes, this does not check flags.blockMemcpy /
    // flags.hybridMemcpy — presumably because the AVX2 hybrid swizzle path can also be chosen
    // without those flags; confirm the fence is intended on every AVX2-capable copy.
    if (CpuSupportsAvx2())
    {
        // Stores are weakly ordered, and we need to ensure they finish before the next submission
        // or copy.
        NonTemporalStoreFence();
    }
#endif
}
#if __cplusplus < 201703L
// Constexpr arrays need an additional definition at namespace scope until c++17.
// (Before C++17, static constexpr data members are not implicitly inline, so odr-use of
// MicroSw_*::MicroBlockExtent requires these out-of-line definitions.)
#if ADDR_HAS_AVX2
constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_2BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_4BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_AVX2::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_D_1BPE_AVX2::MicroBlockExtent;
#endif
#if ADDR_HAS_NEON
constexpr ADDR_EXTENT3D MicroSw_2D_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_8BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_2D_16BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_8BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_3D_16BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_2BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_R_4BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_Z_1BPE_NEON::MicroBlockExtent;
constexpr ADDR_EXTENT3D MicroSw_D_1BPE_NEON::MicroBlockExtent;
#endif
#endif
}

View file

@ -1,7 +1,8 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
/**
@ -26,10 +27,13 @@ typedef void (*UnalignedCopyMemImgFunc)(
void* pImgBlockSliceStart, // Block corresponding to beginning of slice
void* pBuf, // Pointer to data starting from the copy origin.
size_t bufStrideY, // Stride of each row in pBuf
size_t bufStrideZ, // Stride of each slice in pBuf
UINT_32 imageBlocksY, // Width of the image slice, in blocks.
ADDR_COORD2D origin, // Absolute origin, in elements
ADDR_EXTENT2D extent, // Size to copy, in elements
UINT_32 sliceXor, // Includes pipeBankXor and z XOR
UINT_32 imageBlocksZ, // Depth pitch of the image slice, in blocks.
ADDR_COORD3D origin, // Absolute origin, in elements
ADDR_EXTENT3D extent, // Size to copy, in elements
UINT_32 pipeBankXor, // Final value to XOR into the address
BOOL_32 isInMipTail, // True if this is in the mip tail.
const LutAddresser& addresser);
// This class calculates and holds up to four lookup tables (x/y/z/s) which can be used to cheaply calculate the
@ -60,10 +64,21 @@ public:
// Get the block size
UINT_32 GetBlockBits() const { return m_blockBits; }
UINT_32 GetBlockX() const { return m_blockSize.width; }
UINT_32 GetBlockY() const { return m_blockSize.height; }
UINT_32 GetBlockZ() const { return m_blockSize.depth; }
UINT_32 GetBlockXBits() const { return Log2(m_blockSize.width); }
UINT_32 GetBlockYBits() const { return Log2(m_blockSize.height); }
UINT_32 GetBlockZBits() const { return Log2(m_blockSize.depth); }
// Get the microblock size
UINT_32 GetMicroBlockX() const { return m_microBlockSize.width; }
UINT_32 GetMicroBlockY() const { return m_microBlockSize.height; }
UINT_32 GetMicroBlockZ() const { return m_microBlockSize.depth; }
// Get other image props
UINT_32 GetBpeLog2() const { return m_bpeLog2; }
// "Fast single channel" functions to get the part that each channel contributes to be XORd together.
UINT_32 GetAddressX(UINT_32 x) const { return m_pXLut[x & m_xLutMask];}
UINT_32 GetAddressY(UINT_32 y) const { return m_pYLut[y & m_yLutMask];}
@ -71,8 +86,11 @@ public:
UINT_32 GetAddressS(UINT_32 s) const { return m_pSLut[s & m_sLutMask];}
// Get a function that can copy a single 2D slice of an image with this swizzle.
UnalignedCopyMemImgFunc GetCopyMemImgFunc() const;
UnalignedCopyMemImgFunc GetCopyImgMemFunc() const;
UnalignedCopyMemImgFunc GetCopyMemImgFunc(ADDR_COPY_FLAGS flags) const;
UnalignedCopyMemImgFunc GetCopyImgMemFunc(ADDR_COPY_FLAGS flags) const;
void DoCopyMemImgPostFlushes(ADDR_COPY_FLAGS flags) const;
void DoCopyImgMemPreFlushes(ADDR_COPY_FLAGS flags) const;
private:
// Calculate general properties of the swizzle equations
void InitSwizzleProps();
@ -99,6 +117,9 @@ private:
// The block size
ADDR_EXTENT3D m_blockSize;
// The microblock size
ADDR_EXTENT3D m_microBlockSize;
// Number of 'x' bits at the bottom of the equation. Must be a pow2 and at least 1.
// This will be used as a simple optimization to batch together operations on adjacent x pixels.

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -2782,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting(
if ((forbid64KbBlockType == FALSE) && (forbidVarBlockType == FALSE))
{
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_8 maxFmaskSwizzleModeType = 2;
const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1);
const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags);
const UINT_32 numSlices = Max(pIn->numSlices, 1u);
const UINT_32 width = Max(pIn->width, 1u);
@ -3097,8 +3099,10 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting(
// Tracks the size of each valid swizzle mode's surface in bytes
UINT_64 padSize[AddrBlockMaxTiledType] = {};
const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1);
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u);
UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use
UINT_64 minSize = 0; // Tracks the minimum acceptable block type
@ -4111,7 +4115,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx10Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -4177,7 +4181,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -4192,35 +4196,27 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -4230,7 +4226,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopyMemToSurface(
* Gfx10Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -4296,7 +4292,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -4305,40 +4301,32 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -758,6 +758,14 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily(
case FAMILY_PHX:
m_settings.isPhoenix = 1;
break;
case FAMILY_GFX1170:
{
if (ASICREV_IS_GFX1170(chipRevision))
{
m_settings.isGfx1170 = 1;
}
}
break;
default:
ADDR_ASSERT(!"Unknown chip family");
break;
@ -2651,10 +2659,13 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
UINT_64 padSize[AddrBlockMaxTiledType] = {};
const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2);
const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 2 : 1);
UINT_32 ratioLow;
UINT_32 ratioHi;
GetSwizzleModePreferenceRatio(pIn, &ratioLow, &ratioHi);
const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u);
UINT_32 minSizeBlk = AddrBlockMicro;
UINT_32 selectedBlk = AddrBlockMaxTiledType;
UINT_64 minSize = 0;
ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {};
@ -2678,11 +2689,66 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
{
padSize[i] = localOut.surfSize;
if ((minSize == 0) ||
Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi))
if (pIn->useBlockBasedHeuristic)
{
minSize = padSize[i];
minSizeBlk = i;
const UINT_32 blockCountX = localOut.pitch / localOut.blockWidth;
const UINT_32 blockCountY = localOut.height / localOut.blockHeight;
const UINT_32 blockCountZ = localOut.numSlices / localOut.blockSlices;
UINT_32 requiredBlockCountX = 1;
UINT_32 requiredBlockCountY = 1;
UINT_32 requiredBlockCountZ = 1;
switch (pIn->resourceType)
{
case ADDR_RSRC_TEX_1D:
requiredBlockCountX = 2;
break;
case ADDR_RSRC_TEX_2D:
requiredBlockCountX = 2;
requiredBlockCountY = 2;
break;
case ADDR_RSRC_TEX_3D:
requiredBlockCountX = 2;
requiredBlockCountY = 2;
if (IsThick(pIn->resourceType, localIn.swizzleMode))
{
requiredBlockCountZ = 2;
}
break;
default:
ADDR_ASSERT_ALWAYS();
}
// If the block count is sufficient, select this block type. Otherwise, track the block type with minimum size to
// fall back to it, in case no block type can satisfy the block count requirement.
if ((blockCountX >= requiredBlockCountX) &&
(blockCountY >= requiredBlockCountY) &&
(blockCountZ >= requiredBlockCountZ) &&
(localIn.swizzleMode != ADDR_SW_LINEAR))
{
selectedBlk = i;
}
else
{
const bool has3DThick = (allowedSwModeSet.value & Gfx11Rsrc3dThickSwModeMask) != 0;
const bool is3DThin = (pOut->resourceType == ADDR_RSRC_TEX_3D) &&
IsThin(pOut->resourceType, swMode[i]);
if (((has3DThick && is3DThin) == FALSE) && (minSize == 0 || (padSize[i] < minSize)))
{
minSize = padSize[i];
minSizeBlk = i;
}
}
}
else
{
if ((minSize == 0) ||
Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi))
{
minSize = padSize[i];
minSizeBlk = i;
}
}
}
else
@ -2693,63 +2759,77 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting(
}
}
if (pIn->memoryBudget > 1.0)
if (pIn->useBlockBasedHeuristic)
{
// If minimum size is given by swizzle mode with bigger-block type, then don't ever check
// smaller-block type again in coming loop
switch (minSizeBlk)
// If there was no block size that would satisfy block based heuristic, fall back to the budget-based heuristic.
if (selectedBlk == AddrBlockMaxTiledType)
{
case AddrBlockThick256KB:
allowedBlockSet.gfx11.thin256KB = 0;
case AddrBlockThin256KB:
allowedBlockSet.macroThick64KB = 0;
case AddrBlockThick64KB:
allowedBlockSet.macroThin64KB = 0;
case AddrBlockThin64KB:
allowedBlockSet.macroThick4KB = 0;
case AddrBlockThick4KB:
allowedBlockSet.macroThin4KB = 0;
case AddrBlockThin4KB:
allowedBlockSet.micro = 0;
case AddrBlockMicro:
allowedBlockSet.linear = 0;
case AddrBlockLinear:
break;
default:
ADDR_ASSERT_ALWAYS();
break;
selectedBlk = minSizeBlk;
}
for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++)
}
else
{
if (pIn->memoryBudget > 1.0)
{
if ((i != minSizeBlk) &&
Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<::AddrBlockType>(i)))
// If minimum size is given by swizzle mode with bigger-block type, then don't ever check
// smaller-block type again in coming loop
switch (minSizeBlk)
{
if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE)
case AddrBlockThick256KB:
allowedBlockSet.gfx11.thin256KB = 0;
case AddrBlockThin256KB:
allowedBlockSet.macroThick64KB = 0;
case AddrBlockThick64KB:
allowedBlockSet.macroThin64KB = 0;
case AddrBlockThin64KB:
allowedBlockSet.macroThick4KB = 0;
case AddrBlockThick4KB:
allowedBlockSet.macroThin4KB = 0;
case AddrBlockThin4KB:
allowedBlockSet.micro = 0;
case AddrBlockMicro:
allowedBlockSet.linear = 0;
case AddrBlockLinear:
break;
default:
ADDR_ASSERT_ALWAYS();
break;
}
for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++)
{
if ((i != minSizeBlk) &&
Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i)))
{
// Clear the block type if the memory waste is unacceptable
allowedBlockSet.value &= ~(1u << (i - 1));
if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE)
{
// Clear the block type if the memory waste is unacceptable
allowedBlockSet.value &= ~(1u << (i - 1));
}
}
}
// Remove linear block type if 2 or more block types are allowed
if (IsPow2(allowedBlockSet.value) == FALSE)
{
allowedBlockSet.linear = 0;
}
// Select the biggest allowed block type
minSizeBlk = Log2(allowedBlockSet.value) + 1;
if (minSizeBlk == static_cast<UINT_32>(AddrBlockMaxTiledType))
{
minSizeBlk = AddrBlockLinear;
}
}
// Remove linear block type if 2 or more block types are allowed
if (IsPow2(allowedBlockSet.value) == FALSE)
{
allowedBlockSet.linear = 0;
}
// Select the biggest allowed block type
minSizeBlk = Log2(allowedBlockSet.value) + 1;
if (minSizeBlk == static_cast<UINT_32>(AddrBlockMaxTiledType))
{
minSizeBlk = AddrBlockLinear;
}
selectedBlk = minSizeBlk;
}
switch (minSizeBlk)
switch (selectedBlk)
{
case AddrBlockLinear:
allowedSwModeSet.value &= Gfx11LinearSwModeMask;
@ -3685,7 +3765,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx11Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -3751,7 +3831,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -3766,35 +3846,27 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -3804,7 +3876,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopyMemToSurface(
* Gfx11Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -3870,7 +3942,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem(
LutAddresser addresser = LutAddresser();
addresser.Init(fullSwizzlePattern, ADDR_MAX_EQUATION_BIT, blockExtent, blkSizeLog2);
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
UnalignedCopyMemImgFunc pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS();
@ -3879,40 +3951,32 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR2_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR2_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockWidth;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockSlices);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2007-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2007-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -36,7 +36,8 @@ struct Gfx11ChipSettings
{
UINT_32 isStrix : 1;
UINT_32 isPhoenix : 1;
UINT_32 reserved1 : 30;
UINT_32 isGfx1170 : 1;
UINT_32 reserved1 : 29;
// Misc configuration bits
UINT_32 reserved2 : 32;

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -71,8 +71,7 @@ const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] =
Gfx12Lib::Gfx12Lib(
const Client* pClient)
:
Lib(pClient),
m_numSwizzleBits(0)
Lib(pClient)
{
memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable));
}
@ -878,7 +877,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled(
* Gfx12Lib::HwlCopyMemToSurface
*
* @brief
* Copy multiple regions from memory to a non-linear surface.
* Copy multiple regions from memory to a non-linear surface.
*
* @return
* Error or success.
@ -925,7 +924,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
}
LutAddresser addresser = LutAddresser();
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
if (returnCode == ADDR_OK)
{
const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode);
@ -936,7 +935,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {};
GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern);
addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2);
pfnCopyUnaligned = addresser.GetCopyMemImgFunc();
pfnCopyUnaligned = addresser.GetCopyMemImgFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS(); // What format is this?
@ -952,35 +951,27 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
addresser.DoCopyMemImgPostFlushes(pIn->copyFlags);
}
return returnCode;
}
@ -990,7 +981,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopyMemToSurface(
* Gfx12Lib::HwlCopySurfaceToMem
*
* @brief
* Copy multiple regions from a non-linear surface to memory.
* Copy multiple regions from a non-linear surface to memory.
*
* @return
* Error or success.
@ -1037,7 +1028,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
}
LutAddresser addresser = LutAddresser();
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
UnalignedCopyMemImgFunc pfnCopyUnaligned = nullptr;
if (returnCode == ADDR_OK)
{
const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode);
@ -1048,7 +1039,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K] = {};
GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern);
addresser.Init(fullSwizzlePattern, Log2Size256K, localOut.blockExtent, blkSizeLog2);
pfnCopyUnaligned = addresser.GetCopyImgMemFunc();
pfnCopyUnaligned = addresser.GetCopyImgMemFunc(pIn->copyFlags);
if (pfnCopyUnaligned == nullptr)
{
ADDR_ASSERT_ALWAYS(); // What format is this?
@ -1058,78 +1049,37 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlCopySurfaceToMem(
if (returnCode == ADDR_OK)
{
addresser.DoCopyImgMemPreFlushes(pIn->copyFlags);
for (UINT_32 regionIdx = 0; regionIdx < regionCount; regionIdx++)
{
const ADDR3_COPY_MEMSURFACE_REGION* pCurRegion = &pRegions[regionIdx];
const ADDR3_MIP_INFO* pMipInfo = &mipInfo[pCurRegion->mipId];
UINT_64 mipOffset = pIn->singleSubres ? 0 : pMipInfo->macroBlockOffset;
UINT_32 yBlks = pMipInfo->pitch / localOut.blockExtent.width;
UINT_32 zBlks = localOut.sliceSize >> (addresser.GetBlockBits() - addresser.GetBlockZBits());
UINT_32 xStart = pCurRegion->x + pMipInfo->mipTailCoordX;
UINT_32 yStart = pCurRegion->y + pMipInfo->mipTailCoordY;
UINT_32 sliceStart = pCurRegion->slice + pMipInfo->mipTailCoordZ;
ADDR_COORD3D rawOrigin = {
pCurRegion->x + pMipInfo->mipTailCoordX,
pCurRegion->y + pMipInfo->mipTailCoordY,
pCurRegion->slice + pMipInfo->mipTailCoordZ
};
for (UINT_32 slice = sliceStart; slice < (sliceStart + pCurRegion->copyDims.depth); slice++)
{
// The copy functions take the base address of the hardware slice, not the logical slice. Those are
// not the same thing in 3D swizzles. Logical slices within 3D swizzles are handled by sliceXor
// for unaligned copies.
UINT_32 sliceBlkStart = PowTwoAlignDown(slice, localOut.blockExtent.depth);
UINT_32 sliceXor = pIn->pbXor ^ addresser.GetAddressZ(slice);
UINT_64 memOffset = ((slice - pCurRegion->slice) * pCurRegion->memSlicePitch);
UINT_64 imgOffset = mipOffset + (sliceBlkStart * localOut.sliceSize);
ADDR_COORD2D sliceOrigin = { xStart, yStart };
ADDR_EXTENT2D sliceExtent = { pCurRegion->copyDims.width, pCurRegion->copyDims.height };
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, imgOffset),
VoidPtrInc(pCurRegion->pMem, memOffset),
pCurRegion->memRowPitch,
yBlks,
sliceOrigin,
sliceExtent,
sliceXor,
addresser);
}
pfnCopyUnaligned(VoidPtrInc(pIn->pMappedSurface, mipOffset),
pCurRegion->pMem,
pCurRegion->memRowPitch,
pCurRegion->memSlicePitch,
yBlks,
zBlks,
rawOrigin,
pCurRegion->copyDims,
pIn->pbXor,
(pCurRegion->mipId >= localOut.firstMipIdInTail),
addresser);
}
}
return returnCode;
}
/**
************************************************************************************************************************
* Gfx12Lib::HwlComputePipeBankXor
*
* @brief
* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address
*
* @return
* PipeBankXor value
************************************************************************************************************************
*/
ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure
) const
{
if ((m_numSwizzleBits != 0) && // does this configuration support swizzling
// base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes,
// Note that Linear and 256B are excluded.
(IsLinear(pIn->swizzleMode) == FALSE) &&
(IsBlock256b(pIn->swizzleMode) == FALSE))
{
pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits);
}
else
{
pOut->pipeBankXor = 0;
}
return ADDR_OK;
}
/**
************************************************************************************************************************
* Gfx12Lib::GetSwizzlePatternInfo
@ -1263,72 +1213,13 @@ const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo(
BOOL_32 Gfx12Lib::HwlInitGlobalParams(
const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
{
BOOL_32 valid = TRUE;
GB_ADDR_CONFIG_GFX12 gbAddrConfig;
gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig;
switch (gbAddrConfig.bits.NUM_PIPES)
{
case ADDR_CONFIG_1_PIPE:
m_pipesLog2 = 0;
break;
case ADDR_CONFIG_2_PIPE:
m_pipesLog2 = 1;
break;
case ADDR_CONFIG_4_PIPE:
m_pipesLog2 = 2;
break;
case ADDR_CONFIG_8_PIPE:
m_pipesLog2 = 3;
break;
case ADDR_CONFIG_16_PIPE:
m_pipesLog2 = 4;
break;
case ADDR_CONFIG_32_PIPE:
m_pipesLog2 = 5;
break;
case ADDR_CONFIG_64_PIPE:
m_pipesLog2 = 6;
break;
default:
ADDR_ASSERT_ALWAYS();
valid = FALSE;
break;
}
switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE)
{
case ADDR_CONFIG_PIPE_INTERLEAVE_256B:
m_pipeInterleaveLog2 = 8;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_512B:
m_pipeInterleaveLog2 = 9;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_1KB:
m_pipeInterleaveLog2 = 10;
break;
case ADDR_CONFIG_PIPE_INTERLEAVE_2KB:
m_pipeInterleaveLog2 = 11;
break;
default:
ADDR_ASSERT_ALWAYS();
valid = FALSE;
break;
}
m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0);
// Gfx10+ chips treat packed 8-bit 422 formats as 32bpe with 2pix/elem.
m_configFlags.use32bppFor422Fmt = TRUE;
if (valid)
{
InitEquationTable();
InitBlockDimensionTable();
}
InitEquationTable();
InitBlockDimensionTable();
return valid;
return TRUE;
}
/**
@ -1579,10 +1470,10 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor(
pIn->slice,
0);
const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2;
const UINT_32 pipeBankXor = pipeBankXorOffset >> PipeInterleaveLog2;
// Should have no bit set under pipe interleave
ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset);
ADDR_ASSERT((pipeBankXor << PipeInterleaveLog2) == pipeBankXorOffset);
pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor;
}
@ -2043,7 +1934,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
UINT_32 yPosMask = 0;
// First get "max y bit"
for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++)
for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++)
{
ADDR_ASSERT(m_equationTable[eqIndex].addr[i].valid == 1);
@ -2055,7 +1946,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
}
// Then loop again for populating a position mask of "max Y bit"
for (UINT_32 i = m_pipeInterleaveLog2; i < blkSizeLog2; i++)
for (UINT_32 i = PipeInterleaveLog2; i < blkSizeLog2; i++)
{
if ((m_equationTable[eqIndex].addr[i].channel == 1) &&
(m_equationTable[eqIndex].addr[i].index == yMax))
@ -2074,7 +1965,7 @@ ADDR_E_RETURNCODE Gfx12Lib::HwlComputeStereoInfo(
if ((alignedHeight >> yMax) & 1)
{
*pRightXor = yPosMask >> m_pipeInterleaveLog2;
*pRightXor = yPosMask >> PipeInterleaveLog2;
}
}
}

View file

@ -1,7 +1,7 @@
/*
************************************************************************************************************************
*
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: MIT
*
***********************************************************************************************************************/
@ -147,10 +147,6 @@ private:
static const UINT_32 MaxImageDim = 32768; // Max image size is 32k
static const UINT_32 MaxMipLevels = 16;
virtual ADDR_E_RETURNCODE HwlComputePipeBankXor(
const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn,
ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override;
virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override;
virtual ADDR_E_RETURNCODE HwlComputeStereoInfo(
@ -172,8 +168,6 @@ private:
const ADDR3_COPY_MEMSURFACE_REGION* pRegions,
UINT_32 regionCount) const override;
UINT_32 m_numSwizzleBits;
// Initialize equation table
VOID InitEquationTable();