From 7dcd2a4c8710eb9b928f543fb8532f4fefb4b551 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 18 Mar 2026 16:31:40 +0100 Subject: [PATCH] radv: implement VK_KHR_device_fault Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/radv_device.c | 138 ++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 34 deletions(-) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 473f1e08534..90799f2bc69 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1880,38 +1880,67 @@ radv_GetDeviceImageSubresourceLayout(VkDevice device, const VkDeviceImageSubreso radv_DestroyImage(device, image, NULL); } +static VkDeviceFaultAddressInfoKHR +radv_get_device_fault_addr_info(struct radv_device *device, bool *vm_fault_occurred) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + struct radv_winsys_gpuvm_fault_info fault_info = {0}; + VkDeviceFaultAddressInfoKHR addr_fault_info = {0}; + + *vm_fault_occurred = radv_vm_fault_occurred(device, &fault_info); + + if (*vm_fault_occurred) { + addr_fault_info.reportedAddress = ((int64_t)fault_info.addr << 16) >> 16; + addr_fault_info.addressPrecision = 4096; /* 4K page granularity */ + + if (pdev->info.gfx_level >= GFX10) { + addr_fault_info.addressType = G_00A130_RW(fault_info.status) ? VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_KHR + : VK_DEVICE_FAULT_ADDRESS_TYPE_READ_INVALID_KHR; + } else { + /* Not sure how to get the access status on GFX6-9. */ + addr_fault_info.addressType = VK_DEVICE_FAULT_ADDRESS_TYPE_NONE_KHR; + } + } + + return addr_fault_info; +} + +static VkDeviceFaultVendorBinaryHeaderVersionOneKHR +radv_get_device_fault_vendor_binary_header(struct radv_device *device) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + const struct radv_instance *instance = radv_physical_device_instance(pdev); + VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr; + + hdr.headerSize = sizeof(VkDeviceFaultVendorBinaryHeaderVersionOneKHR); + hdr.headerVersion = VK_DEVICE_FAULT_VENDOR_BINARY_HEADER_VERSION_ONE_KHR; + hdr.vendorID = pdev->vk.properties.vendorID; + hdr.deviceID = pdev->vk.properties.deviceID; + hdr.driverVersion = pdev->vk.properties.driverVersion; + memcpy(hdr.pipelineCacheUUID, pdev->cache_uuid, VK_UUID_SIZE); + hdr.applicationNameOffset = 0; + hdr.applicationVersion = instance->vk.app_info.app_version; + hdr.engineNameOffset = 0; + hdr.engineVersion = instance->vk.app_info.engine_version; + hdr.apiVersion = instance->vk.app_info.api_version; + + return hdr; +} + /* VK_EXT_device_fault */ VKAPI_ATTR VkResult VKAPI_CALL radv_GetDeviceFaultInfoEXT(VkDevice _device, VkDeviceFaultCountsEXT *pFaultCounts, VkDeviceFaultInfoEXT *pFaultInfo) { - VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultAddressInfoEXT, out, pFaultInfo ? pFaultInfo->pAddressInfos : NULL, + VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultAddressInfoKHR, out, pFaultInfo ? pFaultInfo->pAddressInfos : NULL, &pFaultCounts->addressInfoCount); - struct radv_winsys_gpuvm_fault_info fault_info = {0}; VK_FROM_HANDLE(radv_device, device, _device); - const struct radv_physical_device *pdev = radv_device_physical(device); - const struct radv_instance *instance = radv_physical_device_instance(pdev); bool vm_fault_occurred = false; - /* Query if a GPUVM fault happened. */ - vm_fault_occurred = radv_vm_fault_occurred(device, &fault_info); - pFaultCounts->vendorInfoCount = 0; pFaultCounts->vendorBinarySize = 0; if (device->gpu_hang_report) { - VkDeviceFaultVendorBinaryHeaderVersionOneEXT hdr; - - hdr.headerSize = sizeof(VkDeviceFaultVendorBinaryHeaderVersionOneEXT); - hdr.headerVersion = VK_DEVICE_FAULT_VENDOR_BINARY_HEADER_VERSION_ONE_EXT; - hdr.vendorID = pdev->vk.properties.vendorID; - hdr.deviceID = pdev->vk.properties.deviceID; - hdr.driverVersion = pdev->vk.properties.driverVersion; - memcpy(hdr.pipelineCacheUUID, pdev->cache_uuid, VK_UUID_SIZE); - hdr.applicationNameOffset = 0; - hdr.applicationVersion = instance->vk.app_info.app_version; - hdr.engineNameOffset = 0; - hdr.engineVersion = instance->vk.app_info.engine_version; - hdr.apiVersion = instance->vk.app_info.api_version; + VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr = radv_get_device_fault_vendor_binary_header(device); pFaultCounts->vendorBinarySize = sizeof(hdr) + strlen(device->gpu_hang_report); if (pFaultInfo) { @@ -1921,24 +1950,65 @@ radv_GetDeviceFaultInfoEXT(VkDevice _device, VkDeviceFaultCountsEXT *pFaultCount } } - if (vm_fault_occurred) { - VkDeviceFaultAddressInfoEXT addr_fault_info = { - .reportedAddress = ((int64_t)fault_info.addr << 16) >> 16, - .addressPrecision = 4096, /* 4K page granularity */ - }; + VkDeviceFaultAddressInfoKHR addr_fault_info = radv_get_device_fault_addr_info(device, &vm_fault_occurred); + if (vm_fault_occurred) { if (pFaultInfo) strncpy(pFaultInfo->description, "A GPUVM fault has been detected", sizeof(pFaultInfo->description)); - - if (pdev->info.gfx_level >= GFX10) { - addr_fault_info.addressType = G_00A130_RW(fault_info.status) ? VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT - : VK_DEVICE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT; - } else { - /* Not sure how to get the access status on GFX6-9. */ - addr_fault_info.addressType = VK_DEVICE_FAULT_ADDRESS_TYPE_NONE_EXT; - } - vk_outarray_append_typed(VkDeviceFaultAddressInfoEXT, &out, elem) *elem = addr_fault_info; + vk_outarray_append_typed(VkDeviceFaultAddressInfoKHR, &out, elem) *elem = addr_fault_info; } return vk_outarray_status(&out); } + +/* VK_KHR_device_fault */ +VKAPI_ATTR VkResult VKAPI_CALL +radv_GetDeviceFaultDebugInfoKHR(VkDevice _device, VkDeviceFaultDebugInfoKHR *pDebugInfo) +{ + VK_FROM_HANDLE(radv_device, device, _device); + + pDebugInfo->vendorBinarySize = 0; + + if (device->gpu_hang_report) { + VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr = radv_get_device_fault_vendor_binary_header(device); + + pDebugInfo->vendorBinarySize = sizeof(hdr) + strlen(device->gpu_hang_report); + if (pDebugInfo->pVendorBinaryData) { + memcpy(pDebugInfo->pVendorBinaryData, &hdr, sizeof(hdr)); + memcpy((char *)pDebugInfo->pVendorBinaryData + sizeof(hdr), device->gpu_hang_report, + strlen(device->gpu_hang_report)); + } + } + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_GetDeviceFaultReportsKHR(VkDevice _device, uint64_t timeout, uint32_t *pFaultCounts, + VkDeviceFaultInfoKHR *pFaultInfo) +{ + VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultInfoKHR, out, pFaultInfo, pFaultCounts); + VK_FROM_HANDLE(radv_device, device, _device); + VkDeviceFaultAddressInfoKHR addr_fault_info; + bool vm_fault_occurred = false; + bool timed_out = false; + + uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); + do { + addr_fault_info = radv_get_device_fault_addr_info(device, &vm_fault_occurred); + } while (timeout > 0 && !vm_fault_occurred && !(timed_out = (abs_timeout < os_time_get_nano()))); + + if (!vm_fault_occurred) + return VK_TIMEOUT; + + VkDeviceFaultInfoKHR fault_info = { + .sType = VK_STRUCTURE_TYPE_DEVICE_FAULT_INFO_KHR, + .flags = VK_DEVICE_FAULT_FLAG_MEMORY_ADDRESS_KHR, + .faultAddressInfo = addr_fault_info, + }; + strncpy(fault_info.description, "A GPUVM fault has been detected", sizeof(fault_info.description)); + + vk_outarray_append_typed(VkDeviceFaultInfoKHR, &out, elem) *elem = fault_info; + + return vk_outarray_status(&out); +}