tu: Implement transient attachments and lazily allocated memory

Transient attachments have been in Vulkan since 1.0 and are a way to
avoid allocating memory for attachments that can live entirely in tile
memory. The driver exposes a memory type with LAZILY_ALLOCATED_BIT, and
apps use this type to allocate images with TRANSIENT_ATTACHMENT usage,
which may only be combined with color/depth/stencil/input attachment
usage. The driver is then supposed to delay allocating memory until it
knows that one of the images bound to the VkDeviceMemory needs actual
backing memory.
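
A minimal sketch of the application side of this contract (assuming
device, width and height are in scope; find_memory_type() is a
hypothetical helper that picks a memoryTypeIndex whose propertyFlags
include the requested bits):

    /* Create a transient color attachment and bind it to lazily
     * allocated memory; unset fields are zero-initialized. */
    VkImageCreateInfo image_info = {
       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
       .imageType = VK_IMAGE_TYPE_2D,
       .format = VK_FORMAT_R8G8B8A8_UNORM,
       .extent = { width, height, 1 },
       .mipLevels = 1,
       .arrayLayers = 1,
       .samples = VK_SAMPLE_COUNT_4_BIT,
       .tiling = VK_IMAGE_TILING_OPTIMAL,
       .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
                VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT,
    };
    VkImage image;
    vkCreateImage(device, &image_info, NULL, &image);

    VkMemoryRequirements reqs;
    vkGetImageMemoryRequirements(device, image, &reqs);

    VkMemoryAllocateInfo alloc_info = {
       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
       .allocationSize = reqs.size,
       .memoryTypeIndex =
          find_memory_type(reqs.memoryTypeBits,
                           VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT),
    };
    VkDeviceMemory memory;
    vkAllocateMemory(device, &alloc_info, NULL, &memory);
    vkBindImageMemory(device, image, memory, 0);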

Implement this using the "lazy VMA" mechanism added earlier. We reserve
an iova range for lazy BOs and only allocate them if we choose sysmem
rendering or an attachment has LOAD_OP_LOAD/STORE_OP_STORE. Because we
never split render passes and instead force sysmem, we don't have to
deal with the extra complexity that splitting would add and can just
allocate everything.
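
Per attachment, the check added in tu_allocate_transient_attachments()
below boils down to this simplified sketch:

    /* With GMEM rendering and neither load nor store, the attachment's
     * contents only ever live in tile memory, so the lazy BO can stay
     * unallocated. */
    static bool
    attachment_needs_backing(bool sysmem, bool load, bool load_stencil,
                             bool store, bool store_stencil)
    {
       return sysmem || load || load_stencil || store || store_stencil;
    }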

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37151>
Connor Abbott 2025-09-02 16:28:54 -04:00 committed by Marge Bot
parent 3b990ba210
commit 764b3d9161
6 changed files with 172 additions and 21 deletions

@@ -121,7 +121,7 @@ tu_GetDeviceBufferMemoryRequirements(
   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = MAX2(align64(size, alignment), size),
      .alignment = alignment,
-      .memoryTypeBits = (1 << device->physical_device->memory.type_count) - 1,
+      .memoryTypeBits = (1 << device->physical_device->memory.non_lazy_type_count) - 1,
   };
   vk_foreach_struct(ext, pMemoryRequirements->pNext) {

@@ -2481,7 +2481,7 @@ tu_trace_end_render_pass(struct tu_cmd_buffer *cmd, bool gmem)
   if (cmd->state.lrz.image_view) {
      struct tu_image *image = cmd->state.lrz.image_view->image;
      addr.bo = image->mem->bo;
-      addr.offset = (image->iova - image->mem->bo->iova) +
+      addr.offset = (image->iova - image->mem->iova) +
                    image->lrz_layout.lrz_fc_offset +
                    offsetof(fd_lrzfc_layout<CHIP>, dir_track);
   }
@@ -3092,6 +3092,31 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
   }
}
+static VkResult
+tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
+{
+   const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_render_pass *rp = cmd->state.pass;
+   for (unsigned i = 0; i < fb->attachment_count; i++) {
+      const struct tu_image_view *iview = cmd->state.attachments[i];
+      if (iview && !(iview->image->vk.create_flags &
+                     VK_IMAGE_CREATE_SPARSE_BINDING_BIT) &&
+          !iview->image->mem->bo &&
+          (sysmem || rp->attachments[i].load ||
+           rp->attachments[i].load_stencil ||
+           rp->attachments[i].store ||
+           rp->attachments[i].store_stencil)) {
+         VkResult result = tu_allocate_lazy_memory(cmd->device,
+                                                   iview->image->mem);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   }
+   return VK_SUCCESS;
+}
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
@@ -3102,6 +3127,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
   const struct tu_image_view *fdm = NULL;
+   VkResult result = tu_allocate_transient_attachments(cmd, false);
+   if (result != VK_SUCCESS) {
+      vk_command_buffer_set_error(&cmd->vk, result);
+      return;
+   }
   if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
      fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment];
   }
@@ -3199,6 +3230,13 @@ static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
                     struct tu_renderpass_result *autotune_result)
{
+   VkResult result = tu_allocate_transient_attachments(cmd, true);
+   if (result != VK_SUCCESS) {
+      vk_command_buffer_set_error(&cmd->vk, result);
+      return;
+   }
   tu_trace_start_render_pass(cmd);
   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);

@@ -1617,6 +1617,14 @@ tu_physical_device_init(struct tu_physical_device *device,
      device->memory.type_count++;
   }
+   device->memory.non_lazy_type_count = device->memory.type_count;
+   if (device->has_lazy_bos) {
+      device->memory.types[device->memory.type_count] =
+         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+         VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
+      device->memory.type_count++;
+   }
   /* Provide fallback UBWC config values if the kernel doesn't support
    * providing them. This should match what the kernel programs.
    */
@@ -3208,6 +3216,20 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName)
   return tu_GetInstanceProcAddr(instance, pName);
}
+static VkResult
+tu_add_to_heap(struct tu_device *dev, struct tu_bo *bo)
+{
+   struct tu_memory_heap *mem_heap = &dev->physical_device->heap;
+   uint64_t mem_heap_used = p_atomic_add_return(&mem_heap->used, bo->size);
+   if (mem_heap_used > mem_heap->size) {
+      p_atomic_add(&mem_heap->used, -bo->size);
+      tu_bo_finish(dev, bo);
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "Out of heap memory");
+   }
+   return VK_SUCCESS;
+}
VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateMemory(VkDevice _device,
                  const VkMemoryAllocateInfo *pAllocateInfo,
@@ -3237,6 +3259,8 @@ tu_AllocateMemory(VkDevice _device,
      return VK_SUCCESS;
   }
+   mem->size = pAllocateInfo->allocationSize;
   const VkImportMemoryFdInfoKHR *fd_info =
      vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
@@ -3304,19 +3328,28 @@ tu_AllocateMemory(VkDevice _device,
                  (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024));
      VkMemoryPropertyFlags mem_property =
         device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex];
-      result = tu_bo_init_new_explicit_iova(
-         device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize,
-         client_address, mem_property, alloc_flags, NULL, name);
+      if (mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) {
+         mem->lazy = true;
+         mtx_init(&mem->lazy_mutex, mtx_plain);
+         enum tu_sparse_vma_flags sparse_flags =
+            (alloc_flags & TU_BO_ALLOC_REPLAYABLE) ?
+            TU_SPARSE_VMA_REPLAYABLE : TU_SPARSE_VMA_NONE;
+         result = tu_sparse_vma_init(device, &mem->vk.base,
+                                     &mem->lazy_vma, &mem->iova,
+                                     sparse_flags,
+                                     pAllocateInfo->allocationSize,
+                                     client_address);
+      } else {
+         result = tu_bo_init_new_explicit_iova(
+            device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize,
+            client_address, mem_property, alloc_flags, NULL, name);
+      }
   }
-   if (result == VK_SUCCESS) {
-      mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
-      if (mem_heap_used > mem_heap->size) {
-         p_atomic_add(&mem_heap->used, -mem->bo->size);
-         tu_bo_finish(device, mem->bo);
-         result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                            "Out of heap memory");
-      }
+   if (result == VK_SUCCESS && !mem->lazy) {
+      result = tu_add_to_heap(device, mem->bo);
+      mem->iova = mem->bo->iova;
   }
   if (result != VK_SUCCESS) {
@@ -3339,6 +3372,53 @@ tu_AllocateMemory(VkDevice _device,
   return VK_SUCCESS;
}
+VkResult
+tu_allocate_lazy_memory(struct tu_device *dev,
+                        struct tu_device_memory *mem)
+{
+   assert(mem->lazy);
+   if (mem->lazy_initialized) {
+      if (mem->bo)
+         return VK_SUCCESS;
+      else
+         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+   VkResult result = VK_SUCCESS;
+   mtx_lock(&mem->lazy_mutex);
+   if (!mem->lazy_initialized) {
+      char name[64] = "lazy vkAllocateMemory()";
+      if (dev->bo_sizes)
+         snprintf(name, ARRAY_SIZE(name), "lazy vkAllocateMemory(%ldkb)",
+                  (long)DIV_ROUND_UP(mem->size, 1024));
+      result =
+         tu_bo_init_new_explicit_iova(dev, &mem->vk.base,
+                                      &mem->bo, mem->size, 0, 0,
+                                      TU_BO_ALLOC_NO_FLAGS,
+                                      &mem->lazy_vma, name);
+      mem->lazy_initialized = true;
+      if (result == VK_SUCCESS) {
+         result = tu_add_to_heap(dev, mem->bo);
+         if (result != VK_SUCCESS) {
+            tu_bo_finish(dev, mem->bo);
+            mem->bo = NULL;
+         }
+      }
+   }
+   mtx_unlock(&mem->lazy_mutex);
+   /* Fail if another thread won the race and failed to allocate a BO */
+   if (result == VK_SUCCESS && !mem->bo) {
+      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+   return result;
+}
VKAPI_ATTR void VKAPI_CALL
tu_FreeMemory(VkDevice _device,
              VkDeviceMemory _mem,
@@ -3352,8 +3432,16 @@ tu_FreeMemory(VkDevice _device,
   TU_RMV(resource_destroy, device, mem);
-   p_atomic_add(&device->physical_device->heap.used, -mem->bo->size);
-   tu_bo_finish(device, mem->bo);
+   if (mem->bo) {
+      p_atomic_add(&device->physical_device->heap.used, -mem->bo->size);
+      tu_bo_finish(device, mem->bo);
+   }
+   if (mem->lazy) {
+      tu_sparse_vma_finish(device, &mem->lazy_vma);
+      mtx_destroy(&mem->lazy_mutex);
+   }
   vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
}
@@ -3438,10 +3526,11 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device,
VKAPI_ATTR void VKAPI_CALL
tu_GetDeviceMemoryCommitment(VkDevice device,
-                             VkDeviceMemory memory,
+                             VkDeviceMemory _memory,
                             VkDeviceSize *pCommittedMemoryInBytes)
{
-   *pCommittedMemoryInBytes = 0;
+   VK_FROM_HANDLE(tu_device_memory, memory, _memory);
+   *pCommittedMemoryInBytes = memory->lazy_initialized ? memory->size : 0;
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -3581,7 +3670,7 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device,
   VK_FROM_HANDLE(tu_device, device, _device);
   assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
   pMemoryFdProperties->memoryTypeBits =
-      (1 << device->physical_device->memory.type_count) - 1;
+      (1 << device->physical_device->memory.non_lazy_type_count) - 1;
   return VK_SUCCESS;
}

@@ -146,6 +146,7 @@ struct tu_physical_device
   bool has_preemption;
   struct {
+      uint32_t non_lazy_type_count;
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;
@@ -478,6 +479,15 @@ struct tu_device_memory
{
   struct vk_device_memory vk;
+   uint64_t iova;
+   uint64_t size;
+   /* For lazy memory */
+   bool lazy;
+   bool lazy_initialized;
+   struct tu_sparse_vma lazy_vma;
+   mtx_t lazy_mutex;
   struct tu_bo *bo;
   /* for dedicated allocations */
@@ -486,6 +496,10 @@ struct tu_device_memory
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)
+VkResult
+tu_allocate_lazy_memory(struct tu_device *dev,
+                        struct tu_device_memory *mem);
struct tu_attachment_info
{
   struct tu_image_view *attachment;

@@ -563,7 +563,8 @@ tu_get_image_format_properties(
      }
   }
-   if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+   if (image_usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+                      VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT)) {
      if (!(format_feature_flags &
            (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
             VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) {

@@ -1005,10 +1005,11 @@ tu_image_bind(struct tu_device *device,
   }
   image->mem = mem;
   image->mem_offset = offset;
-   image->iova = mem->bo->iova + offset;
+   image->iova = mem->iova + offset;
   if (image->vk.usage & (VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT |
                          VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT)) {
+      assert(mem->bo); /* Transient images cannot have these usages */
      if (!mem->bo->map) {
         result = tu_bo_map(device, mem->bo, NULL);
         if (result != VK_SUCCESS)
@@ -1063,6 +1064,14 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image,
   if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)
      alignment = 65536;
+   /* Only expose the lazy memory type for images with TRANSIENT_ATTACHMENT
+    * usage.
+    */
+   uint32_t type_count =
+      (image->vk.usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) ?
+      dev->physical_device->memory.type_count :
+      dev->physical_device->memory.non_lazy_type_count;
   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      /* Due to how we fake the sparse tile size, the real size may not be
       * aligned. CTS doesn't like this, and real apps may also be surprised,
@@ -1070,7 +1079,7 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image,
       */
      .size = align64(image->total_size, alignment),
      .alignment = alignment,
-      .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1,
+      .memoryTypeBits = (1 << type_count) - 1,
   };
   vk_foreach_struct(ext, pMemoryRequirements->pNext) {