panvk: Advertise a HOST_CACHED memory type if we have WB maps

If the GPU is IO coherent, we expose one memory type that's both
host-coherent and host-cached. Otherwise we expose one type that's
host-uncached and host-coherent, and one that's host-cached and
host-noncoherent.

By default, we advertise <cached,non-coherent> before
<non-cached,coherent> because that's the combination providing the
best performance when the user knows how to deal with the
non-coherent nature of the GPU.

Unfortunately, the CTS has a few bugs (missing or incorrect
flush/invalidate calls) forcing us to re-order things in CI, which is
what the coherent_before_cached PANVK_DEBUG flag is for. We might drop
the flag at some point (some fixes have been submitted, others are on
their way).

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36385>
This commit is contained in:
Faith Ekstrand 2025-07-24 21:25:20 +00:00 committed by Boris Brezillon
parent 2afef24d3f
commit 1c7793ea0b
7 changed files with 119 additions and 10 deletions

View file

@ -94,7 +94,7 @@ panfrost-g52-vk:arm64:
FDO_CI_CONCURRENT: 6 # We get OOMkills if we go too wide since VKCTS 1.4.4.0
MESA_VK_IGNORE_CONFORMANCE_WARNING: 1
PAN_I_WANT_A_BROKEN_VULKAN_DRIVER: 1
PANVK_DEBUG: "no_known_warn,sync"
PANVK_DEBUG: "no_known_warn,sync,coherent_before_cached"
DEQP_SUITE: panfrost-g52-vk
HWCI_START_WESTON: 1
@ -184,7 +184,7 @@ panfrost-g610-vk:arm64:
# Using more than 4 cores cause instabilities
FDO_CI_CONCURRENT: 4
MESA_VK_IGNORE_CONFORMANCE_WARNING: 1
PANVK_DEBUG: "no_known_warn,sync,cs"
PANVK_DEBUG: "no_known_warn,sync,cs,coherent_before_cached"
DEQP_SUITE: panfrost-g610-vk
DEQP_FRACTION: 5
HWCI_START_WESTON: 1
@ -208,7 +208,7 @@ panfrost-g925-vk:arm64:
variables:
DRIVER_NAME: panvk
MESA_VK_IGNORE_CONFORMANCE_WARNING: 1
PANVK_DEBUG: "no_known_warn,sync,cs"
PANVK_DEBUG: "no_known_warn,sync,cs,coherent_before_cached"
DEQP_SUITE: panfrost-g925-vk
HWCI_START_WESTON: 1

View file

@ -90,12 +90,17 @@ panvk_AllocateMemory(VkDevice _device,
}
VK_FROM_HANDLE(panvk_device, device, _device);
struct panvk_physical_device *physical_device =
to_panvk_physical_device(device->vk.physical);
struct panvk_device_memory *mem;
bool can_be_exported = false;
VkResult result;
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
const VkMemoryType *type =
&physical_device->memory.types[pAllocateInfo->memoryTypeIndex];
const VkExportMemoryAllocateInfo *export_info =
vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO);
@ -130,9 +135,19 @@ panvk_AllocateMemory(VkDevice _device,
goto err_destroy_mem;
}
} else {
uint32_t bo_flags = 0;
/* We don't do cached on exported buffers to keep the pre-WB_MMAP
* behavior.
*/
if (!can_be_exported &&
(type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
bo_flags |= PAN_KMOD_BO_FLAG_WB_MMAP;
bo_flags = panvk_device_adjust_bo_flags(device, bo_flags);
mem->bo = pan_kmod_bo_alloc(device->kmod.dev,
can_be_exported ? NULL : device->kmod.vm,
pAllocateInfo->allocationSize, 0);
pAllocateInfo->allocationSize, bo_flags);
if (!mem->bo) {
result = panvk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto err_destroy_mem;
@ -401,8 +416,38 @@ panvk_GetMemoryFdPropertiesKHR(VkDevice _device,
to_panvk_physical_device(device->vk.physical);
assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
pMemoryFdProperties->memoryTypeBits =
BITFIELD_MASK(phys_dev->memory.type_count);
struct pan_kmod_bo *bo = pan_kmod_bo_import(device->kmod.dev, fd, 0);
if (!bo)
return VK_ERROR_INVALID_EXTERNAL_HANDLE;
pMemoryFdProperties->memoryTypeBits = 0;
/* Keep things simple by only allowing host-visible if the BO doesn't require
* kernel-side synchronization going through the dma-buf exporter, which is
* reflected through the PAN_KMOD_BO_FLAG_NO_MMAP flag.
*/
const bool can_do_host_visible = !(bo->flags & PAN_KMOD_BO_FLAG_NO_MMAP);
const bool can_do_host_coherent = !(bo->flags & PAN_KMOD_BO_FLAG_WB_MMAP) ||
(bo->flags & PAN_KMOD_BO_FLAG_IO_COHERENT);
const bool can_do_host_cached = (bo->flags & PAN_KMOD_BO_FLAG_WB_MMAP);
pMemoryFdProperties->memoryTypeBits = 0;
for (uint32_t i = 0; i < phys_dev->memory.type_count; i++) {
if (!can_do_host_visible && (phys_dev->memory.types[i].propertyFlags &
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
continue;
if (!can_do_host_coherent && (phys_dev->memory.types[i].propertyFlags &
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
continue;
if (!can_do_host_cached && (phys_dev->memory.types[i].propertyFlags &
VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
continue;
pMemoryFdProperties->memoryTypeBits |= BITFIELD_BIT(i);
}
pan_kmod_bo_put(bo);
return VK_SUCCESS;
}

View file

@ -48,6 +48,7 @@ static const struct debug_control panvk_debug_options[] = {
{"wsi_afbc", PANVK_DEBUG_WSI_AFBC},
{"no_wb_mmap", PANVK_DEBUG_NO_WB_MMAP},
{"no_user_mmap_sync", PANVK_DEBUG_NO_USER_MMAP_SYNC},
{"coherent_before_cached", PANVK_DEBUG_COHERENT_BEFORE_CACHED},
{NULL, 0},
};

View file

@ -32,6 +32,7 @@ enum panvk_debug_flags {
PANVK_DEBUG_WSI_AFBC = 1 << 13,
PANVK_DEBUG_NO_WB_MMAP = 1 << 14,
PANVK_DEBUG_NO_USER_MMAP_SYNC = 1 << 15,
PANVK_DEBUG_COHERENT_BEFORE_CACHED = 1 << 16,
};
extern uint64_t panvk_debug;

View file

@ -242,20 +242,80 @@ static VkResult
get_device_heaps(struct panvk_physical_device *device,
const struct panvk_instance *instance)
{
int host_coherent_not_cached_idx = -1;
int host_cached_not_coherent_idx = -1;
device->memory.heap_count = 1;
device->memory.heaps[0] = (VkMemoryHeap) {
.size = get_system_heap_size(),
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
};
device->memory.type_count = 1;
device->memory.types[0] = (VkMemoryType) {
device->memory.type_count = 0;
/* We don't have VRAM, but we expose a device-local only type so we can
* prevent imported dma-bufs that come from other drivers/subsystems from
* being CPU-mapped.
*/
device->memory.types[device->memory.type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
.heapIndex = 0,
};
if (device->kmod.dev->props.is_io_coherent) {
assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
/* If the device is coherent, we just have one memory type that's both
* host-cached and host-coherent. */
device->memory.types[device->memory.type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = 0,
};
}
if (!PANVK_DEBUG(NO_WB_MMAP) &&
(device->kmod.dev->props.supported_bo_flags & PAN_KMOD_BO_FLAG_WB_MMAP)) {
assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
host_cached_not_coherent_idx = device->memory.type_count;
device->memory.types[device->memory.type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
.heapIndex = 0,
};
}
assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
host_coherent_not_cached_idx = device->memory.type_count;
device->memory.types[device->memory.type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = 0,
};
/* Ideally, we'd place HOST_CACHED first for perf reasons, but there are
* so many broken CTS tests (missing or invalid flush/invalidate
* calls), and so many more added with each version, that it is impossible
* to keep up. So, if we're asked to (coherent_before_cached debug option),
* keep things ordered in a way that the first HOST_VISIBLE type is also
* the one requiring no CPU cache maintenance.
*/
if (PANVK_DEBUG(COHERENT_BEFORE_CACHED) &&
host_cached_not_coherent_idx != -1 &&
host_coherent_not_cached_idx != -1 &&
host_coherent_not_cached_idx > host_cached_not_coherent_idx) {
VkMemoryType host_cached_not_coherent_type =
device->memory.types[host_cached_not_coherent_idx];
device->memory.types[host_cached_not_coherent_idx] =
device->memory.types[host_coherent_not_cached_idx];
device->memory.types[host_coherent_not_cached_idx] =
host_cached_not_coherent_type;
}
return VK_SUCCESS;
}

View file

@ -61,7 +61,7 @@ struct panvk_physical_device {
VkMemoryHeap heaps[1];
uint32_t heap_count;
VkMemoryType types[1];
VkMemoryType types[4];
uint32_t type_count;
} memory;

View file

@ -808,7 +808,9 @@ panvk_per_arch(get_physical_device_properties)(
.standardSampleLocations = true,
.optimalBufferCopyOffsetAlignment = 64,
.optimalBufferCopyRowPitchAlignment = 64,
.nonCoherentAtomSize = 64,
/* If we can't detect the cacheline size, assume 64-byte cachelines. */
.nonCoherentAtomSize = util_has_cache_ops() ? util_cache_granularity() : 64,
/* Vulkan 1.0 sparse properties */
.sparseResidencyNonResidentStrict = false,