tu: Implement transient attachments and lazily allocated memory
Transient attachments have been in Vulkan since 1.0 and are a way to avoid allocating memory for attachments that can live entirely in tile memory. The driver exposes a memory type with LAZILY_ALLOCATED_BIT, and apps use this type to allocate images with TRANSIENT_ATTACHMENT usage, which are restricted to color/depth/stencil/input attachment usage. The driver is then supposed to delay allocating memory until it knows that one of the images bound to the VkDeviceMemory must have actual backing memory.

Implement this using the "lazy VMA" mechanism added earlier. We reserve an iova range for lazy BOs and only allocate them if we chose sysmem rendering or there is a LOAD_OP_LOAD/STORE_OP_STORE. Because we never split render passes and force sysmem instead, we don't have to deal with the extra complexity that would entail here and can simply allocate everything.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37151>
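For reference, the application-side contract this implements looks roughly like the sketch below. This is not code from this change; it assumes a VkDevice "device" and a VkPhysicalDeviceMemoryProperties "mem_props" are in scope, uses ffs() from <strings.h> as a simplistic fallback type pick, and omits error handling:

    /* Sketch: create a multisampled color attachment that can live
     * entirely in tile memory, and bind it to lazily allocated memory. */
    VkImageCreateInfo image_info = {
       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
       .imageType = VK_IMAGE_TYPE_2D,
       .format = VK_FORMAT_R8G8B8A8_UNORM,
       .extent = { 1920, 1080, 1 },
       .mipLevels = 1,
       .arrayLayers = 1,
       .samples = VK_SAMPLE_COUNT_4_BIT,
       .tiling = VK_IMAGE_TILING_OPTIMAL,
       /* TRANSIENT_ATTACHMENT may only be combined with
        * color/depth/stencil/input attachment usage. */
       .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
                VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT,
       .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
    };
    VkImage image;
    vkCreateImage(device, &image_info, NULL, &image);

    VkMemoryRequirements reqs;
    vkGetImageMemoryRequirements(device, image, &reqs);

    /* Start from the first supported type as a fallback, then prefer a
     * type with LAZILY_ALLOCATED_BIT if the implementation exposes one. */
    uint32_t type_index = ffs(reqs.memoryTypeBits) - 1;
    for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++) {
       if ((reqs.memoryTypeBits & (1u << i)) &&
           (mem_props.memoryTypes[i].propertyFlags &
            VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)) {
          type_index = i;
          break;
       }
    }

    VkMemoryAllocateInfo alloc_info = {
       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
       .allocationSize = reqs.size,
       .memoryTypeIndex = type_index,
    };
    VkDeviceMemory memory;
    vkAllocateMemory(device, &alloc_info, NULL, &memory);
    vkBindImageMemory(device, image, memory, 0);

With this change, such a vkAllocateMemory() call initially only reserves an iova range; the backing BO is created the first time a render pass actually needs the attachment's contents in system memory (sysmem rendering, or a LOAD_OP_LOAD/STORE_OP_STORE touching it).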
This commit is contained in:
parent 3b990ba210
commit 764b3d9161

6 changed files with 172 additions and 21 deletions
@@ -121,7 +121,7 @@ tu_GetDeviceBufferMemoryRequirements(
    pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
       .size = MAX2(align64(size, alignment), size),
       .alignment = alignment,
-      .memoryTypeBits = (1 << device->physical_device->memory.type_count) - 1,
+      .memoryTypeBits = (1 << device->physical_device->memory.non_lazy_type_count) - 1,
    };
 
    vk_foreach_struct(ext, pMemoryRequirements->pNext) {
@@ -2481,7 +2481,7 @@ tu_trace_end_render_pass(struct tu_cmd_buffer *cmd, bool gmem)
    if (cmd->state.lrz.image_view) {
       struct tu_image *image = cmd->state.lrz.image_view->image;
       addr.bo = image->mem->bo;
-      addr.offset = (image->iova - image->mem->bo->iova) +
+      addr.offset = (image->iova - image->mem->iova) +
                     image->lrz_layout.lrz_fc_offset +
                     offsetof(fd_lrzfc_layout<CHIP>, dir_track);
    }

@@ -3092,6 +3092,31 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
    }
 }
 
+static VkResult
+tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
+{
+   const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_render_pass *rp = cmd->state.pass;
+
+   for (unsigned i = 0; i < fb->attachment_count; i++) {
+      const struct tu_image_view *iview = cmd->state.attachments[i];
+      if (iview && !(iview->image->vk.create_flags &
+                     VK_IMAGE_CREATE_SPARSE_BINDING_BIT) &&
+          !iview->image->mem->bo &&
+          (sysmem || rp->attachments[i].load ||
+           rp->attachments[i].load_stencil ||
+           rp->attachments[i].store ||
+           rp->attachments[i].store_stencil)) {
+         VkResult result = tu_allocate_lazy_memory(cmd->device,
+                                                   iview->image->mem);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
 template <chip CHIP>
 static void
 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

@@ -3102,6 +3127,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
    const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
    const struct tu_image_view *fdm = NULL;
 
+   VkResult result = tu_allocate_transient_attachments(cmd, false);
+   if (result != VK_SUCCESS) {
+      vk_command_buffer_set_error(&cmd->vk, result);
+      return;
+   }
+
    if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
       fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment];
    }

@@ -3199,6 +3230,13 @@ static void
 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
                      struct tu_renderpass_result *autotune_result)
 {
+   VkResult result = tu_allocate_transient_attachments(cmd, true);
+
+   if (result != VK_SUCCESS) {
+      vk_command_buffer_set_error(&cmd->vk, result);
+      return;
+   }
+
    tu_trace_start_render_pass(cmd);
 
    tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
@@ -1617,6 +1617,14 @@ tu_physical_device_init(struct tu_physical_device *device,
       device->memory.type_count++;
    }
 
+   device->memory.non_lazy_type_count = device->memory.type_count;
+   if (device->has_lazy_bos) {
+      device->memory.types[device->memory.type_count] =
+         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+         VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
+      device->memory.type_count++;
+   }
+
    /* Provide fallback UBWC config values if the kernel doesn't support
    * providing them. This should match what the kernel programs.
    */

@@ -3208,6 +3216,20 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName)
    return tu_GetInstanceProcAddr(instance, pName);
 }
 
+static VkResult
+tu_add_to_heap(struct tu_device *dev, struct tu_bo *bo)
+{
+   struct tu_memory_heap *mem_heap = &dev->physical_device->heap;
+   uint64_t mem_heap_used = p_atomic_add_return(&mem_heap->used, bo->size);
+   if (mem_heap_used > mem_heap->size) {
+      p_atomic_add(&mem_heap->used, -bo->size);
+      tu_bo_finish(dev, bo);
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "Out of heap memory");
+   }
+   return VK_SUCCESS;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_AllocateMemory(VkDevice _device,
                   const VkMemoryAllocateInfo *pAllocateInfo,

@@ -3237,6 +3259,8 @@ tu_AllocateMemory(VkDevice _device,
       return VK_SUCCESS;
    }
 
+   mem->size = pAllocateInfo->allocationSize;
+
    const VkImportMemoryFdInfoKHR *fd_info =
       vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
 

@@ -3304,19 +3328,28 @@ tu_AllocateMemory(VkDevice _device,
                  (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024));
       VkMemoryPropertyFlags mem_property =
          device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex];
-      result = tu_bo_init_new_explicit_iova(
-         device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize,
-         client_address, mem_property, alloc_flags, NULL, name);
+
+      if (mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) {
+         mem->lazy = true;
+         mtx_init(&mem->lazy_mutex, mtx_plain);
+         enum tu_sparse_vma_flags sparse_flags =
+            (alloc_flags & TU_BO_ALLOC_REPLAYABLE) ?
+            TU_SPARSE_VMA_REPLAYABLE : TU_SPARSE_VMA_NONE;
+         result = tu_sparse_vma_init(device, &mem->vk.base,
+                                     &mem->lazy_vma, &mem->iova,
+                                     sparse_flags,
+                                     pAllocateInfo->allocationSize,
+                                     client_address);
+      } else {
+         result = tu_bo_init_new_explicit_iova(
+            device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize,
+            client_address, mem_property, alloc_flags, NULL, name);
+      }
    }
 
-   if (result == VK_SUCCESS) {
-      mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
-      if (mem_heap_used > mem_heap->size) {
-         p_atomic_add(&mem_heap->used, -mem->bo->size);
-         tu_bo_finish(device, mem->bo);
-         result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                            "Out of heap memory");
-      }
+   if (result == VK_SUCCESS && !mem->lazy) {
+      result = tu_add_to_heap(device, mem->bo);
+      mem->iova = mem->bo->iova;
    }
 
    if (result != VK_SUCCESS) {

@@ -3339,6 +3372,53 @@ tu_AllocateMemory(VkDevice _device,
    return VK_SUCCESS;
 }
 
+VkResult
+tu_allocate_lazy_memory(struct tu_device *dev,
+                        struct tu_device_memory *mem)
+{
+   assert(mem->lazy);
+
+   if (mem->lazy_initialized) {
+      if (mem->bo)
+         return VK_SUCCESS;
+      else
+         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   VkResult result = VK_SUCCESS;
+   mtx_lock(&mem->lazy_mutex);
+   if (!mem->lazy_initialized) {
+      char name[64] = "lazy vkAllocateMemory()";
+      if (dev->bo_sizes)
+         snprintf(name, ARRAY_SIZE(name), "lazy vkAllocateMemory(%ldkb)",
+                  (long)DIV_ROUND_UP(mem->size, 1024));
+      result =
+         tu_bo_init_new_explicit_iova(dev, &mem->vk.base,
+                                      &mem->bo, mem->size, 0, 0,
+                                      TU_BO_ALLOC_NO_FLAGS,
+                                      &mem->lazy_vma, name);
+      mem->lazy_initialized = true;
+
+      if (result == VK_SUCCESS) {
+         result = tu_add_to_heap(dev, mem->bo);
+
+         if (result != VK_SUCCESS) {
+            tu_bo_finish(dev, mem->bo);
+            mem->bo = NULL;
+         }
+      }
+   }
+   mtx_unlock(&mem->lazy_mutex);
+
+   /* Fail if another thread won the race and failed to allocate a BO */
+   if (result == VK_SUCCESS && !mem->bo) {
+      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   return result;
+}
+
+
 VKAPI_ATTR void VKAPI_CALL
 tu_FreeMemory(VkDevice _device,
               VkDeviceMemory _mem,

@@ -3352,8 +3432,16 @@ tu_FreeMemory(VkDevice _device,
 
    TU_RMV(resource_destroy, device, mem);
 
-   p_atomic_add(&device->physical_device->heap.used, -mem->bo->size);
-   tu_bo_finish(device, mem->bo);
+   if (mem->bo) {
+      p_atomic_add(&device->physical_device->heap.used, -mem->bo->size);
+      tu_bo_finish(device, mem->bo);
+   }
+
+   if (mem->lazy) {
+      tu_sparse_vma_finish(device, &mem->lazy_vma);
+      mtx_destroy(&mem->lazy_mutex);
+   }
 
    vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
 }

@@ -3438,10 +3526,11 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device,
 
 VKAPI_ATTR void VKAPI_CALL
 tu_GetDeviceMemoryCommitment(VkDevice device,
-                             VkDeviceMemory memory,
+                             VkDeviceMemory _memory,
                              VkDeviceSize *pCommittedMemoryInBytes)
 {
-   *pCommittedMemoryInBytes = 0;
+   VK_FROM_HANDLE(tu_device_memory, memory, _memory);
+   *pCommittedMemoryInBytes = memory->lazy_initialized ? memory->size : 0;
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL

@@ -3581,7 +3670,7 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device,
    VK_FROM_HANDLE(tu_device, device, _device);
    assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
    pMemoryFdProperties->memoryTypeBits =
-      (1 << device->physical_device->memory.type_count) - 1;
+      (1 << device->physical_device->memory.non_lazy_type_count) - 1;
    return VK_SUCCESS;
 }
@@ -146,6 +146,7 @@ struct tu_physical_device
    bool has_preemption;
 
    struct {
+      uint32_t non_lazy_type_count;
       uint32_t type_count;
       VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
    } memory;

@@ -478,6 +479,15 @@ struct tu_device_memory
 {
    struct vk_device_memory vk;
 
+   uint64_t iova;
+   uint64_t size;
+
+   /* For lazy memory */
+   bool lazy;
+   bool lazy_initialized;
+   struct tu_sparse_vma lazy_vma;
+   mtx_t lazy_mutex;
+
    struct tu_bo *bo;
 
    /* for dedicated allocations */

@@ -486,6 +496,10 @@ struct tu_device_memory
 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                                VK_OBJECT_TYPE_DEVICE_MEMORY)
 
+VkResult
+tu_allocate_lazy_memory(struct tu_device *dev,
+                        struct tu_device_memory *mem);
+
 struct tu_attachment_info
 {
    struct tu_image_view *attachment;
@@ -563,7 +563,8 @@ tu_get_image_format_properties(
       }
    }
 
-   if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+   if (image_usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+                      VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT)) {
       if (!(format_feature_flags &
             (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
              VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) {
@@ -1005,10 +1005,11 @@ tu_image_bind(struct tu_device *device,
    }
    image->mem = mem;
    image->mem_offset = offset;
-   image->iova = mem->bo->iova + offset;
+   image->iova = mem->iova + offset;
 
    if (image->vk.usage & (VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT |
                           VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT)) {
+      assert(mem->bo); /* Transient images cannot have these usages */
       if (!mem->bo->map) {
         result = tu_bo_map(device, mem->bo, NULL);
         if (result != VK_SUCCESS)

@@ -1063,6 +1064,14 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image,
    if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)
       alignment = 65536;
 
+   /* Only expose the lazy memory type for images with TRANSIENT_ATTACHMENT
+    * usage.
+    */
+   uint32_t type_count =
+      (image->vk.usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) ?
+      dev->physical_device->memory.type_count :
+      dev->physical_device->memory.non_lazy_type_count;
+
    pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
       /* Due to how we fake the sparse tile size, the real size may not be
        * aligned. CTS doesn't like this, and real apps may also be surprised,

@@ -1070,7 +1079,7 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image,
       */
       .size = align64(image->total_size, alignment),
       .alignment = alignment,
-      .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1,
+      .memoryTypeBits = (1 << type_count) - 1,
    };
 
    vk_foreach_struct(ext, pMemoryRequirements->pNext) {