From 1c7793ea0bba93295ed6149152d00e5efa8a7267 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Thu, 24 Jul 2025 21:25:20 +0000 Subject: [PATCH] panvk: Advertise a HOST_CACHED memory type if we have WB maps If the GPU is IO coherent, we expose one memory type that's both host-coherent and host-cached. Otherwise we expose one type that's host-uncached and host-coherent, and one that's host-cached and host-noncoherent. By default, we advertise the host-cached type before the host-coherent one, because that's the ordering providing the best performance in situations where the user knows how to deal with the non-coherent nature of the GPU. Unfortunately, the CTS has a few bugs (missing or incorrect flush/invalidate calls) forcing us to add a debug flag (coherent_before_cached) that re-orders things so the host-coherent type comes first. We might drop the flag at some point (some fixes have been submitted, others are on their way). Reviewed-by: Boris Brezillon Reviewed-by: Christoph Pillmayer Part-of: --- src/panfrost/ci/gitlab-ci.yml | 6 +- src/panfrost/vulkan/panvk_device_memory.c | 51 ++++++++++++++- src/panfrost/vulkan/panvk_instance.c | 1 + src/panfrost/vulkan/panvk_instance.h | 1 + src/panfrost/vulkan/panvk_physical_device.c | 64 ++++++++++++++++++- src/panfrost/vulkan/panvk_physical_device.h | 2 +- .../vulkan/panvk_vX_physical_device.c | 4 +- 7 files changed, 119 insertions(+), 10 deletions(-) diff --git a/src/panfrost/ci/gitlab-ci.yml b/src/panfrost/ci/gitlab-ci.yml index 9d8c572a586..0aca9a6d150 100644 --- a/src/panfrost/ci/gitlab-ci.yml +++ b/src/panfrost/ci/gitlab-ci.yml @@ -94,7 +94,7 @@ panfrost-g52-vk:arm64: FDO_CI_CONCURRENT: 6 # We get OOMkills if we go too wide since VKCTS 1.4.4.0 MESA_VK_IGNORE_CONFORMANCE_WARNING: 1 PAN_I_WANT_A_BROKEN_VULKAN_DRIVER: 1 - PANVK_DEBUG: "no_known_warn,sync" + PANVK_DEBUG: "no_known_warn,sync,coherent_before_cached" DEQP_SUITE: panfrost-g52-vk HWCI_START_WESTON: 1 @@ -184,7 +184,7 @@ panfrost-g610-vk:arm64: # Using more than 4 cores cause instabilities FDO_CI_CONCURRENT: 4 MESA_VK_IGNORE_CONFORMANCE_WARNING: 1 - PANVK_DEBUG: "no_known_warn,sync,cs" + 
PANVK_DEBUG: "no_known_warn,sync,cs,coherent_before_cached" DEQP_SUITE: panfrost-g610-vk DEQP_FRACTION: 5 HWCI_START_WESTON: 1 @@ -208,7 +208,7 @@ panfrost-g925-vk:arm64: variables: DRIVER_NAME: panvk MESA_VK_IGNORE_CONFORMANCE_WARNING: 1 - PANVK_DEBUG: "no_known_warn,sync,cs" + PANVK_DEBUG: "no_known_warn,sync,cs,coherent_before_cached" DEQP_SUITE: panfrost-g925-vk HWCI_START_WESTON: 1 diff --git a/src/panfrost/vulkan/panvk_device_memory.c b/src/panfrost/vulkan/panvk_device_memory.c index 85f1369a1ff..50586bd21ad 100644 --- a/src/panfrost/vulkan/panvk_device_memory.c +++ b/src/panfrost/vulkan/panvk_device_memory.c @@ -90,12 +90,17 @@ panvk_AllocateMemory(VkDevice _device, } VK_FROM_HANDLE(panvk_device, device, _device); + struct panvk_physical_device *physical_device = + to_panvk_physical_device(device->vk.physical); struct panvk_device_memory *mem; bool can_be_exported = false; VkResult result; assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); + const VkMemoryType *type = + &physical_device->memory.types[pAllocateInfo->memoryTypeIndex]; + const VkExportMemoryAllocateInfo *export_info = vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); @@ -130,9 +135,19 @@ panvk_AllocateMemory(VkDevice _device, goto err_destroy_mem; } } else { + uint32_t bo_flags = 0; + + /* We don't do cached on exported buffers to keep the pre-WB_MMAP + * behavior. + */ + if (!can_be_exported && + (type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + bo_flags |= PAN_KMOD_BO_FLAG_WB_MMAP; + + bo_flags = panvk_device_adjust_bo_flags(device, bo_flags); mem->bo = pan_kmod_bo_alloc(device->kmod.dev, can_be_exported ? 
NULL : device->kmod.vm, - pAllocateInfo->allocationSize, 0); + pAllocateInfo->allocationSize, bo_flags); if (!mem->bo) { result = panvk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); goto err_destroy_mem; @@ -401,8 +416,38 @@ panvk_GetMemoryFdPropertiesKHR(VkDevice _device, to_panvk_physical_device(device->vk.physical); assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - pMemoryFdProperties->memoryTypeBits = - BITFIELD_MASK(phys_dev->memory.type_count); + + struct pan_kmod_bo *bo = pan_kmod_bo_import(device->kmod.dev, fd, 0); + if (!bo) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + pMemoryFdProperties->memoryTypeBits = 0; + + /* Keep things simple by only allowing host-visible if the BO doesn't require + * kernel-side synchronization going through the dma-buf exporter, which is + * reflected through the PAN_KMOD_BO_FLAG_FORCE_FULL_KERNEL_SYNC flag. + */ + const bool can_do_host_visible = !(bo->flags & PAN_KMOD_BO_FLAG_NO_MMAP); + const bool can_do_host_coherent = !(bo->flags & PAN_KMOD_BO_FLAG_WB_MMAP) || + (bo->flags & PAN_KMOD_BO_FLAG_IO_COHERENT); + const bool can_do_host_cached = (bo->flags & PAN_KMOD_BO_FLAG_WB_MMAP); + + pMemoryFdProperties->memoryTypeBits = 0; + for (uint32_t i = 0; i < phys_dev->memory.type_count; i++) { + if (!can_do_host_visible && (phys_dev->memory.types[i].propertyFlags & + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) + continue; + if (!can_do_host_coherent && (phys_dev->memory.types[i].propertyFlags & + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + continue; + if (!can_do_host_cached && (phys_dev->memory.types[i].propertyFlags & + VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + continue; + + pMemoryFdProperties->memoryTypeBits |= BITFIELD_BIT(i); + } + + pan_kmod_bo_put(bo); return VK_SUCCESS; } diff --git a/src/panfrost/vulkan/panvk_instance.c b/src/panfrost/vulkan/panvk_instance.c index d5bade584f4..53a6aa316ce 100644 --- a/src/panfrost/vulkan/panvk_instance.c +++ b/src/panfrost/vulkan/panvk_instance.c @@ -48,6 +48,7 @@ static const 
struct debug_control panvk_debug_options[] = { {"wsi_afbc", PANVK_DEBUG_WSI_AFBC}, {"no_wb_mmap", PANVK_DEBUG_NO_WB_MMAP}, {"no_user_mmap_sync", PANVK_DEBUG_NO_USER_MMAP_SYNC}, + {"coherent_before_cached", PANVK_DEBUG_COHERENT_BEFORE_CACHED}, {NULL, 0}, }; diff --git a/src/panfrost/vulkan/panvk_instance.h b/src/panfrost/vulkan/panvk_instance.h index bf474f6a201..1189a118f39 100644 --- a/src/panfrost/vulkan/panvk_instance.h +++ b/src/panfrost/vulkan/panvk_instance.h @@ -32,6 +32,7 @@ enum panvk_debug_flags { PANVK_DEBUG_WSI_AFBC = 1 << 13, PANVK_DEBUG_NO_WB_MMAP = 1 << 14, PANVK_DEBUG_NO_USER_MMAP_SYNC = 1 << 15, + PANVK_DEBUG_COHERENT_BEFORE_CACHED = 1 << 16, }; extern uint64_t panvk_debug; diff --git a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c index 96f3a302a99..796409332e4 100644 --- a/src/panfrost/vulkan/panvk_physical_device.c +++ b/src/panfrost/vulkan/panvk_physical_device.c @@ -242,20 +242,80 @@ static VkResult get_device_heaps(struct panvk_physical_device *device, const struct panvk_instance *instance) { + int host_coherent_not_cached_idx = -1; + int host_cached_not_coherent_idx = -1; + device->memory.heap_count = 1; device->memory.heaps[0] = (VkMemoryHeap) { .size = get_system_heap_size(), .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, }; - device->memory.type_count = 1; - device->memory.types[0] = (VkMemoryType) { + device->memory.type_count = 0; + + /* We don't have VRAM, but we expose a device-local only type so we can + * prevent imported dma-bufs that come from other drivers/subsystems from + * being CPU-mapped. + */ + device->memory.types[device->memory.type_count++] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = 0, + }; + + if (device->kmod.dev->props.is_io_coherent) { + assert(device->memory.type_count < ARRAY_SIZE(device->memory.types)); + /* If the device is coherent, we just have one memory type that's both + * host-cached and host-coherent. 
*/ + device->memory.types[device->memory.type_count++] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + .heapIndex = 0, + }; + } + + if (!PANVK_DEBUG(NO_WB_MMAP) && + (device->kmod.dev->props.supported_bo_flags & PAN_KMOD_BO_FLAG_WB_MMAP)) { + assert(device->memory.type_count < ARRAY_SIZE(device->memory.types)); + host_cached_not_coherent_idx = device->memory.type_count; + device->memory.types[device->memory.type_count++] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 0, + }; + } + + assert(device->memory.type_count < ARRAY_SIZE(device->memory.types)); + host_coherent_not_cached_idx = device->memory.type_count; + device->memory.types[device->memory.type_count++] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, .heapIndex = 0, }; + /* Ideally, we'd place HOST_CACHED first for perf reasons, but there's + * so many broken CTS tests (missing or invalid flush/invalidate + * calls), and so many added at each version that it gets impossible to + * catch up. So, keep things ordered in a way that the first HOST_VISIBLE + * type is also the one requiring no CPU cache maintenance if we're asked + * to. 
+ */ + if (PANVK_DEBUG(COHERENT_BEFORE_CACHED) && + host_cached_not_coherent_idx != -1 && + host_coherent_not_cached_idx != -1 && + host_coherent_not_cached_idx > host_cached_not_coherent_idx) { + VkMemoryType host_cached_not_coherent_type = + device->memory.types[host_cached_not_coherent_idx]; + + device->memory.types[host_cached_not_coherent_idx] = + device->memory.types[host_coherent_not_cached_idx]; + device->memory.types[host_coherent_not_cached_idx] = + host_cached_not_coherent_type; + } + return VK_SUCCESS; } diff --git a/src/panfrost/vulkan/panvk_physical_device.h b/src/panfrost/vulkan/panvk_physical_device.h index 81dbb4992d2..14d0f9a687f 100644 --- a/src/panfrost/vulkan/panvk_physical_device.h +++ b/src/panfrost/vulkan/panvk_physical_device.h @@ -61,7 +61,7 @@ struct panvk_physical_device { VkMemoryHeap heaps[1]; uint32_t heap_count; - VkMemoryType types[1]; + VkMemoryType types[4]; uint32_t type_count; } memory; diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c index 885e4fcdcb3..3fca6f3ff11 100644 --- a/src/panfrost/vulkan/panvk_vX_physical_device.c +++ b/src/panfrost/vulkan/panvk_vX_physical_device.c @@ -808,7 +808,9 @@ panvk_per_arch(get_physical_device_properties)( .standardSampleLocations = true, .optimalBufferCopyOffsetAlignment = 64, .optimalBufferCopyRowPitchAlignment = 64, - .nonCoherentAtomSize = 64, + + /* If we can't detect the cacheline size, assume 64 bytes cachelines. */ + .nonCoherentAtomSize = util_has_cache_ops() ? util_cache_granularity() : 64, /* Vulkan 1.0 sparse properties */ .sparseResidencyNonResidentStrict = false,