From 41fa96538656d19f129ae50fb974d184f8a93d25 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 10 Apr 2026 18:25:12 +0200 Subject: [PATCH] radv/amdgpu: emulate sparse residency for the SMEM loads with NULL PRT workaround Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/radv_radeon_winsys.h | 1 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c | 87 ++++++++++++++++++- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h | 2 + .../vulkan/winsys/amdgpu/radv_amdgpu_winsys.c | 35 ++++++++ .../vulkan/winsys/amdgpu/radv_amdgpu_winsys.h | 5 ++ 5 files changed, 126 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index 47539caaa62..70e628f2594 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -53,6 +53,7 @@ enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_VM_UPDATE_WAIT = (1 << 14), RADEON_FLAG_VM_PAD_1PAGE = (1 << 15), RADEON_FLAG_ENCRYPTED = (1 << 16), + RADEON_FLAG_EMULATE_SPARSE_RESIDENCY = (1 << 17), }; enum radeon_ctx_priority { diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c index 918e9d4eb40..222a50b3c30 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c @@ -134,18 +134,74 @@ radv_amdgpu_log_va_op(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_b } } +static uint64_t +radv_amdgpu_virtual_bo_get_low_addr(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo) +{ + return bo->base.va & ~(1ull << ws->info.address_prt_wa_control_bit); /* VA with the control bit cleared */ +} + +static int +radv_amdgpu_virtual_bo_bind_low_null_prt(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, + uint64_t bo_offset, uint64_t bo_size, uint32_t ops) +{ + const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, bo); + uint64_t offset = 0; + + assert(util_is_aligned(bo_offset, 4096) && util_is_aligned(bo_size, 4096)); + + while (bo_size > 0) { /* map the null PRT BO from its offset 0, one chunk of at most its size at a time */ + const 
uint64_t chunk_size = MIN2(bo_size, ws->null_prt_bug.bo->size); + int r; + + r = radv_amdgpu_bo_va_op(ws, radv_amdgpu_winsys_bo(ws->null_prt_bug.bo)->bo_handle, 0, chunk_size, + low_va + bo_offset + offset, 0, 0, ops); + if (r) + return r; + + offset += chunk_size; + bo_size -= chunk_size; + } + + return 0; +} + static int radv_amdgpu_virtual_bo_init_mapping(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, uint64_t size) { + int r; - return radv_amdgpu_bo_va_op(ws, 0, 0, size, bo->base.va, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP); + r = radv_amdgpu_bo_va_op(ws, 0, 0, size, bo->base.va, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP); + if (r) + return r; + + if (bo->emulate_sparse_residency) { + /* Nothing is bound yet: back the whole "LOW" address range with the zero-initialized BO to + * emulate residency. + */ + r = radv_amdgpu_virtual_bo_bind_low_null_prt(ws, bo, 0, size, AMDGPU_VA_OP_MAP); + } + + return r; } static int radv_amdgpu_virtual_bo_clear_mapping(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo) { const uint64_t va_size = radv_amdgpu_bo_va_size(bo->base.size, bo->flags); - return radv_amdgpu_bo_va_op(ws, 0, 0, va_size, bo->base.va, 0, 0, AMDGPU_VA_OP_CLEAR); + int r; + + r = radv_amdgpu_bo_va_op(ws, 0, 0, va_size, bo->base.va, 0, 0, AMDGPU_VA_OP_CLEAR); + if (r) + return r; + + if (bo->emulate_sparse_residency) { + /* Also clear the same range in the "LOW" address space (VA with the control bit cleared). */ + const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, bo); + + r = radv_amdgpu_bo_va_op(ws, 0, 0, va_size, low_va, 0, 0, AMDGPU_VA_OP_CLEAR); + } + + return r; } static int @@ -158,8 +214,16 @@ radv_amdgpu_virtual_bo_map(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_win if (r) return r; - radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset); + if (parent->emulate_sparse_residency) { + /* Mirror the binding in the "LOW" address space so both addresses map the same BO range. 
*/ + const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, parent); + r = radv_amdgpu_bo_va_op(ws, bo->bo_handle, bo_offset, size, low_va + offset, 0, 0, AMDGPU_VA_OP_REPLACE); + if (r) + return r; + } + + radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset); return r; } @@ -173,8 +237,16 @@ radv_amdgpu_virtual_bo_unmap(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_w if (r) return r; - radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset); + if (parent->emulate_sparse_residency) { + /* The range was just unmapped: point its "LOW" address range back at the zero-initialized BO to + * emulate residency. + */ + r = radv_amdgpu_virtual_bo_bind_low_null_prt(ws, parent, offset, size, AMDGPU_VA_OP_REPLACE); + if (r) + return r; + } + radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset); return r; } @@ -356,6 +428,9 @@ radv_amdgpu_winsys_virtual_bo_create(struct radeon_winsys *_ws, uint64_t size, u assert(!replay_address || (flags & RADEON_FLAG_REPLAYABLE)); + if (flags & RADEON_FLAG_EMULATE_SPARSE_RESIDENCY) + replay_address &= ~(1ull << ws->info.address_prt_wa_control_bit); + const uint64_t va_size = radv_amdgpu_bo_va_size(size, flags); const uint64_t va_flags = AMDGPU_VA_RANGE_HIGH | (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) | (flags & RADEON_FLAG_REPLAYABLE ? AMDGPU_VA_RANGE_REPLAYABLE : 0); @@ -371,8 +446,12 @@ radv_amdgpu_winsys_virtual_bo_create(struct radeon_winsys *_ws, uint64_t size, u bo->base.va = va; bo->base.size = size; bo->va_handle = va_handle; + bo->emulate_sparse_residency = !!(flags & RADEON_FLAG_EMULATE_SPARSE_RESIDENCY); bo->base.is_virtual = true; + if (bo->emulate_sparse_residency) + bo->base.va |= 1ull << ws->info.address_prt_wa_control_bit; /* the LOW address is this VA without that bit */ + /* Reserve a PRT VA region. 
*/ r = radv_amdgpu_virtual_bo_init_mapping(ws, bo, va_size); if (r) { diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h index 86f83fdf93e..5458cf064e1 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h @@ -35,6 +35,8 @@ struct radv_amdgpu_winsys_bo { uint32_t bo_handle; void *cpu_map; + + bool emulate_sparse_residency; /* virtual BO created with RADEON_FLAG_EMULATE_SPARSE_RESIDENCY */ }; static inline struct radv_amdgpu_winsys_bo * diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c index 3bdc4c914e8..344ab79d4d5 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c @@ -113,6 +113,34 @@ radv_amdgpu_winsys_query_gpuvm_fault(struct radeon_winsys *rws, struct radv_wins static simple_mtx_t winsys_creation_mutex = SIMPLE_MTX_INITIALIZER; static struct hash_table *winsyses = NULL; +static VkResult +radv_amdgpu_null_prt_bug_init(struct radeon_winsys *rws) +{ + struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys *)rws; + + if (!ws->info.compiler_info.has_smem_with_null_prt_bug) + return VK_SUCCESS; + + /* Create a zero-initialized 8MiB BO that backs the "LOW" address space of virtual BOs emulating + * sparse residency; ranges larger than the BO map it repeatedly, chunk by chunk. 
+ */ + return ws->base.buffer_create(&ws->base, 8 * 1024 * 1024 /* 8MiB */, 4096, RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_ZERO_VRAM | RADEON_FLAG_READ_ONLY | + RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_PREFER_LOCAL_BO, + RADV_BO_PRIORITY_VIRTUAL, 0, &ws->null_prt_bug.bo); +} + +static void +radv_amdgpu_null_prt_bug_finish(struct radeon_winsys *rws) +{ + struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys *)rws; + + if (!ws->info.compiler_info.has_smem_with_null_prt_bug) + return; + + ws->base.buffer_destroy(&ws->base, ws->null_prt_bug.bo); +} + static void radv_amdgpu_winsys_destroy(struct radeon_winsys *rws) { @@ -146,6 +174,9 @@ radv_amdgpu_winsys_destroy(struct radeon_winsys *rws) fclose(ws->bo_history_logfile); u_rwlock_destroy(&ws->log_bo_list_lock); + + radv_amdgpu_null_prt_bug_finish(rws); /* NOTE(review): after log_bo_list_lock destroy; confirm BO destroy skips it */ + ac_drm_device_deinitialize(ws->dev); FREE(rws); } @@ -346,6 +377,10 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, radv_amdgpu_bo_init_functions(ws); radv_amdgpu_cs_init_functions(ws); + result = radv_amdgpu_null_prt_bug_init(&ws->base); + if (result != VK_SUCCESS) + goto winsys_fail; + _mesa_hash_table_insert(winsyses, (void *)ac_drm_device_get_cookie(dev), ws); simple_mtx_unlock(&winsys_creation_mutex); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h index f8b823b73ac..7aad807a433 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h @@ -63,6 +63,11 @@ struct radv_amdgpu_winsys { uint64_t vm_timeline_seq_num; uint32_t refcount; + + struct { + /* Zero-initialized BO bound to the LOW address space of unbound virtual BO ranges. */ + struct radeon_winsys_bo *bo; + } null_prt_bug; }; static inline struct radv_amdgpu_winsys *