winsys/amdgpu: add RADEON_FLAG_UNCACHED for faster blits over PCIe

Small blits benefit more. Good access pattern is required.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4935>
This commit is contained in:
Marek Olšák 2020-05-06 14:46:13 -04:00 committed by Marge Bot
parent cbbc18bc67
commit 36c0124804
2 changed files with 79 additions and 27 deletions

View file

@ -67,7 +67,8 @@ enum radeon_bo_flag
RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
RADEON_FLAG_READ_ONLY = (1 << 5),
RADEON_FLAG_32BIT = (1 << 6),
RADEON_FLAG_ENCRYPTED = (1 << 7)
RADEON_FLAG_ENCRYPTED = (1 << 7),
RADEON_FLAG_UNCACHED = (1 << 8), /* only gfx9 and newer */
};
enum radeon_dependency_flag
@ -712,6 +713,11 @@ enum radeon_heap
RADEON_HEAP_GTT_WC_READ_ONLY_32BIT,
RADEON_HEAP_GTT_WC_32BIT,
RADEON_HEAP_GTT,
RADEON_HEAP_GTT_UNCACHED_WC,
RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY,
RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT,
RADEON_HEAP_GTT_UNCACHED_WC_32BIT,
RADEON_HEAP_GTT_UNCACHED,
RADEON_MAX_SLAB_HEAPS,
RADEON_MAX_CACHED_HEAPS = RADEON_MAX_SLAB_HEAPS,
};
@ -730,6 +736,11 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_WC_32BIT:
case RADEON_HEAP_GTT:
case RADEON_HEAP_GTT_UNCACHED_WC:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_UNCACHED_WC_32BIT:
case RADEON_HEAP_GTT_UNCACHED:
return RADEON_DOMAIN_GTT;
default:
assert(0);
@ -739,36 +750,69 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
{
unsigned flags =
RADEON_FLAG_NO_INTERPROCESS_SHARING | (heap != RADEON_HEAP_GTT ? RADEON_FLAG_GTT_WC : 0);
unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING;
switch (heap) {
case RADEON_HEAP_GTT:
case RADEON_HEAP_GTT_UNCACHED:
break;
default:
flags |= RADEON_FLAG_GTT_WC;
}
switch (heap) {
case RADEON_HEAP_GTT_UNCACHED_WC:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_UNCACHED_WC_32BIT:
case RADEON_HEAP_GTT_UNCACHED:
flags |= RADEON_FLAG_UNCACHED;
break;
default:
break;
}
switch (heap) {
case RADEON_HEAP_VRAM_READ_ONLY:
case RADEON_HEAP_VRAM_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_WC_READ_ONLY:
case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT:
flags |= RADEON_FLAG_READ_ONLY;
break;
default:
break;
}
switch (heap) {
case RADEON_HEAP_VRAM_READ_ONLY_32BIT:
case RADEON_HEAP_VRAM_32BIT:
case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_WC_32BIT:
case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_UNCACHED_WC_32BIT:
flags |= RADEON_FLAG_32BIT;
default:
break;
}
switch (heap) {
case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
return flags | RADEON_FLAG_NO_CPU_ACCESS;
case RADEON_HEAP_VRAM_READ_ONLY:
case RADEON_HEAP_GTT_WC_READ_ONLY:
return flags | RADEON_FLAG_READ_ONLY;
case RADEON_HEAP_VRAM_READ_ONLY_32BIT:
case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT:
return flags | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT;
case RADEON_HEAP_VRAM_32BIT:
case RADEON_HEAP_GTT_WC_32BIT:
return flags | RADEON_FLAG_32BIT;
case RADEON_HEAP_VRAM:
case RADEON_HEAP_GTT_WC:
case RADEON_HEAP_GTT:
flags |= RADEON_FLAG_NO_CPU_ACCESS;
break;
default:
return flags;
break;
}
return flags;
}
/* Return the heap index for winsys allocators, or -1 on failure. */
static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeon_bo_flag flags)
{
bool uncached;
/* VRAM implies WC (write combining) */
assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
/* NO_CPU_ACCESS implies VRAM only. */
@ -779,7 +823,7 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo
return -1;
/* Unsupported flags: NO_SUBALLOC, SPARSE. */
if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS |
if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_UNCACHED |
RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT))
return -1;
@ -806,15 +850,20 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo
}
break;
case RADEON_DOMAIN_GTT:
uncached = flags & RADEON_FLAG_UNCACHED;
switch (flags & (RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT)) {
case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT:
return RADEON_HEAP_GTT_WC_READ_ONLY_32BIT;
return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT
: RADEON_HEAP_GTT_WC_READ_ONLY_32BIT;
case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY:
return RADEON_HEAP_GTT_WC_READ_ONLY;
return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY
: RADEON_HEAP_GTT_WC_READ_ONLY;
case RADEON_FLAG_GTT_WC | RADEON_FLAG_32BIT:
return RADEON_HEAP_GTT_WC_32BIT;
return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_32BIT
: RADEON_HEAP_GTT_WC_32BIT;
case RADEON_FLAG_GTT_WC:
return RADEON_HEAP_GTT_WC;
return uncached ? RADEON_HEAP_GTT_UNCACHED_WC : RADEON_HEAP_GTT_WC;
case RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT:
case RADEON_FLAG_READ_ONLY:
assert(!"READ_ONLY without WC is disallowed");
@ -823,7 +872,7 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo
assert(!"32BIT without WC is disallowed");
return -1;
case 0:
return RADEON_HEAP_GTT;
return uncached ? RADEON_HEAP_GTT_UNCACHED : RADEON_HEAP_GTT;
}
break;
default:

View file

@ -554,6 +554,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
if (!(flags & RADEON_FLAG_READ_ONLY))
vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
if (flags & RADEON_FLAG_UNCACHED)
vm_flags |= AMDGPU_VM_MTYPE_UC;
r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
AMDGPU_VA_OP_MAP);
if (r)