radv/amdgpu: emulate sparse residency for the SMEM loads with NULL PRT workaround

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38698>
This commit is contained in:
Samuel Pitoiset 2026-04-10 18:25:12 +02:00 committed by Marge Bot
parent 0be39ce4ad
commit 41fa965386
5 changed files with 126 additions and 4 deletions

View file

@ -53,6 +53,7 @@ enum radeon_bo_flag { /* bitfield */
RADEON_FLAG_VM_UPDATE_WAIT = (1 << 14),
RADEON_FLAG_VM_PAD_1PAGE = (1 << 15),
RADEON_FLAG_ENCRYPTED = (1 << 16),
RADEON_FLAG_EMULATE_SPARSE_RESIDENCY = (1 << 17),
};
enum radeon_ctx_priority {

View file

@ -134,18 +134,74 @@ radv_amdgpu_log_va_op(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_b
}
}
/* Return the alias of the BO's VA in the "LOW" address space, i.e. the VA with
 * the PRT-workaround control bit cleared.
 */
static uint64_t
radv_amdgpu_virtual_bo_get_low_addr(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   const uint64_t control_bit_mask = 1ull << ws->info.address_prt_wa_control_bit;

   return bo->base.va & ~control_bit_mask;
}
/* Map [bo_offset, bo_offset + bo_size) of the BO's "LOW" address space alias to
 * the winsys-wide zero-initialized BO, chunking the range because that BO may be
 * smaller than the span to cover. Returns 0 on success or the first failing
 * VA-op error code.
 */
static int
radv_amdgpu_virtual_bo_bind_low_null_prt(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo,
                                         uint64_t bo_offset, uint64_t bo_size, uint32_t ops)
{
   const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, bo);
   const uint64_t chunk_max = ws->null_prt_bug.bo->size;

   /* VA mappings operate on page granularity. */
   assert(util_is_aligned(bo_offset, 4096) && util_is_aligned(bo_size, 4096));

   for (uint64_t done = 0; done < bo_size;) {
      const uint64_t chunk_size = MIN2(bo_size - done, chunk_max);
      const int r = radv_amdgpu_bo_va_op(ws, radv_amdgpu_winsys_bo(ws->null_prt_bug.bo)->bo_handle, 0, chunk_size,
                                         low_va + bo_offset + done, 0, 0, ops);
      if (r)
         return r;

      done += chunk_size;
   }

   return 0;
}
/* Create the initial VA mapping for a freshly created virtual (sparse) BO:
 * reserve the whole range as PRT, then — when sparse residency is emulated for
 * the NULL PRT SMEM workaround — also bind the "LOW" address-space alias to the
 * zero-initialized BO so unbound pages read as zero. Returns 0 on success.
 */
static int
radv_amdgpu_virtual_bo_init_mapping(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, uint64_t size)
{
   int r;

   r = radv_amdgpu_bo_va_op(ws, 0, 0, size, bo->base.va, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
   if (r)
      return r;

   if (bo->emulate_sparse_residency) {
      /* Bind the "LOW" address space to the zero-initialized BO when it's allocated to emulate
       * residency.
       */
      r = radv_amdgpu_virtual_bo_bind_low_null_prt(ws, bo, 0, size, AMDGPU_VA_OP_MAP);
   }

   return r;
}
/* Tear down the VA mappings of a virtual (sparse) BO: clear the primary PRT
 * range and, when sparse residency is emulated, also clear the "LOW"
 * address-space alias. Returns 0 on success or the first failing VA-op error.
 */
static int
radv_amdgpu_virtual_bo_clear_mapping(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   const uint64_t va_size = radv_amdgpu_bo_va_size(bo->base.size, bo->flags);
   int r;

   r = radv_amdgpu_bo_va_op(ws, 0, 0, va_size, bo->base.va, 0, 0, AMDGPU_VA_OP_CLEAR);
   if (r)
      return r;

   if (bo->emulate_sparse_residency) {
      /* Clear the "LOW" address space mapping when it's released. */
      const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, bo);

      r = radv_amdgpu_bo_va_op(ws, 0, 0, va_size, low_va, 0, 0, AMDGPU_VA_OP_CLEAR);
   }

   return r;
}
static int
@ -158,8 +214,16 @@ radv_amdgpu_virtual_bo_map(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_win
if (r)
return r;
radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset);
if (parent->emulate_sparse_residency) {
/* Bind the "LOW" address space to the same BO. */
const uint64_t low_va = radv_amdgpu_virtual_bo_get_low_addr(ws, parent);
r = radv_amdgpu_bo_va_op(ws, bo->bo_handle, bo_offset, size, low_va + offset, 0, 0, AMDGPU_VA_OP_REPLACE);
if (r)
return r;
}
radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset);
return r;
}
@ -173,8 +237,16 @@ radv_amdgpu_virtual_bo_unmap(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_w
if (r)
return r;
radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset);
if (parent->emulate_sparse_residency) {
/* Re-bind the "LOW" address space to the zero-initialized BO when it's unmapped to emulate
* residency.
*/
r = radv_amdgpu_virtual_bo_bind_low_null_prt(ws, parent, offset, size, AMDGPU_VA_OP_REPLACE);
if (r)
return r;
}
radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset);
return r;
}
@ -356,6 +428,9 @@ radv_amdgpu_winsys_virtual_bo_create(struct radeon_winsys *_ws, uint64_t size, u
assert(!replay_address || (flags & RADEON_FLAG_REPLAYABLE));
if (flags & RADEON_FLAG_EMULATE_SPARSE_RESIDENCY)
replay_address &= ~(1ull << ws->info.address_prt_wa_control_bit);
const uint64_t va_size = radv_amdgpu_bo_va_size(size, flags);
const uint64_t va_flags = AMDGPU_VA_RANGE_HIGH | (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
(flags & RADEON_FLAG_REPLAYABLE ? AMDGPU_VA_RANGE_REPLAYABLE : 0);
@ -371,8 +446,12 @@ radv_amdgpu_winsys_virtual_bo_create(struct radeon_winsys *_ws, uint64_t size, u
bo->base.va = va;
bo->base.size = size;
bo->va_handle = va_handle;
bo->emulate_sparse_residency = !!(flags & RADEON_FLAG_EMULATE_SPARSE_RESIDENCY);
bo->base.is_virtual = true;
if (bo->emulate_sparse_residency)
bo->base.va |= 1ull << ws->info.address_prt_wa_control_bit;
/* Reserve a PRT VA region. */
r = radv_amdgpu_virtual_bo_init_mapping(ws, bo, va_size);
if (r) {

View file

@ -35,6 +35,8 @@ struct radv_amdgpu_winsys_bo {
uint32_t bo_handle;
void *cpu_map;
bool emulate_sparse_residency;
};
static inline struct radv_amdgpu_winsys_bo *

View file

@ -113,6 +113,34 @@ radv_amdgpu_winsys_query_gpuvm_fault(struct radeon_winsys *rws, struct radv_wins
static simple_mtx_t winsys_creation_mutex = SIMPLE_MTX_INITIALIZER;
static struct hash_table *winsyses = NULL;
/* Allocate the shared zero-initialized BO used to emulate sparse residency on
 * GPUs affected by the SMEM-with-NULL-PRT bug; no-op otherwise.
 */
static VkResult
radv_amdgpu_null_prt_bug_init(struct radeon_winsys *rws)
{
   struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys *)rws;

   if (!ws->info.compiler_info.has_smem_with_null_prt_bug)
      return VK_SUCCESS;

   /* Create a zero-allocated 8MiB BO that will be used to map partially resident sparse buffers at
    * creation or when explicitly unmapped.
    */
   const uint64_t bo_size = 8ull * 1024 * 1024; /* 8MiB */
   const unsigned bo_flags = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_ZERO_VRAM | RADEON_FLAG_READ_ONLY |
                             RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_PREFER_LOCAL_BO;

   return ws->base.buffer_create(&ws->base, bo_size, 4096, RADEON_DOMAIN_VRAM, bo_flags, RADV_BO_PRIORITY_VIRTUAL, 0,
                                 &ws->null_prt_bug.bo);
}
/* Destroy the NULL-PRT workaround BO; the BO only exists when the workaround
 * applies to this GPU.
 */
static void
radv_amdgpu_null_prt_bug_finish(struct radeon_winsys *rws)
{
   struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys *)rws;

   if (ws->info.compiler_info.has_smem_with_null_prt_bug)
      ws->base.buffer_destroy(&ws->base, ws->null_prt_bug.bo);
}
static void
radv_amdgpu_winsys_destroy(struct radeon_winsys *rws)
{
@ -146,6 +174,9 @@ radv_amdgpu_winsys_destroy(struct radeon_winsys *rws)
fclose(ws->bo_history_logfile);
u_rwlock_destroy(&ws->log_bo_list_lock);
radv_amdgpu_null_prt_bug_finish(rws);
ac_drm_device_deinitialize(ws->dev);
FREE(rws);
}
@ -346,6 +377,10 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags,
radv_amdgpu_bo_init_functions(ws);
radv_amdgpu_cs_init_functions(ws);
result = radv_amdgpu_null_prt_bug_init(&ws->base);
if (result != VK_SUCCESS)
goto winsys_fail;
_mesa_hash_table_insert(winsyses, (void *)ac_drm_device_get_cookie(dev), ws);
simple_mtx_unlock(&winsys_creation_mutex);

View file

@ -63,6 +63,11 @@ struct radv_amdgpu_winsys {
uint64_t vm_timeline_seq_num;
uint32_t refcount;
struct {
/* A zero-allocated BO used to map the LOW address space of virtual allocations. */
struct radeon_winsys_bo *bo;
} null_prt_bug;
};
static inline struct radv_amdgpu_winsys *