radv: implement VK_KHR_device_fault

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41648>
This commit is contained in:
Samuel Pitoiset 2026-03-18 16:31:40 +01:00
parent f6b06ea3de
commit 7dcd2a4c87

View file

@ -1880,38 +1880,67 @@ radv_GetDeviceImageSubresourceLayout(VkDevice device, const VkDeviceImageSubreso
radv_DestroyImage(device, image, NULL);
}
static VkDeviceFaultAddressInfoKHR
radv_get_device_fault_addr_info(struct radv_device *device, bool *vm_fault_occurred)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_winsys_gpuvm_fault_info fault_info = {0};
VkDeviceFaultAddressInfoKHR addr_fault_info = {0};
*vm_fault_occurred = radv_vm_fault_occurred(device, &fault_info);
if (*vm_fault_occurred) {
addr_fault_info.reportedAddress = ((int64_t)fault_info.addr << 16) >> 16;
addr_fault_info.addressPrecision = 4096; /* 4K page granularity */
if (pdev->info.gfx_level >= GFX10) {
addr_fault_info.addressType = G_00A130_RW(fault_info.status) ? VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_KHR
: VK_DEVICE_FAULT_ADDRESS_TYPE_READ_INVALID_KHR;
} else {
/* Not sure how to get the access status on GFX6-9. */
addr_fault_info.addressType = VK_DEVICE_FAULT_ADDRESS_TYPE_NONE_KHR;
}
}
return addr_fault_info;
}
static VkDeviceFaultVendorBinaryHeaderVersionOneKHR
radv_get_device_fault_vendor_binary_header(struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr;
hdr.headerSize = sizeof(VkDeviceFaultVendorBinaryHeaderVersionOneKHR);
hdr.headerVersion = VK_DEVICE_FAULT_VENDOR_BINARY_HEADER_VERSION_ONE_KHR;
hdr.vendorID = pdev->vk.properties.vendorID;
hdr.deviceID = pdev->vk.properties.deviceID;
hdr.driverVersion = pdev->vk.properties.driverVersion;
memcpy(hdr.pipelineCacheUUID, pdev->cache_uuid, VK_UUID_SIZE);
hdr.applicationNameOffset = 0;
hdr.applicationVersion = instance->vk.app_info.app_version;
hdr.engineNameOffset = 0;
hdr.engineVersion = instance->vk.app_info.engine_version;
hdr.apiVersion = instance->vk.app_info.api_version;
return hdr;
}
/* VK_EXT_device_fault */
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetDeviceFaultInfoEXT(VkDevice _device, VkDeviceFaultCountsEXT *pFaultCounts, VkDeviceFaultInfoEXT *pFaultInfo)
{
VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultAddressInfoEXT, out, pFaultInfo ? pFaultInfo->pAddressInfos : NULL,
VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultAddressInfoKHR, out, pFaultInfo ? pFaultInfo->pAddressInfos : NULL,
&pFaultCounts->addressInfoCount);
struct radv_winsys_gpuvm_fault_info fault_info = {0};
VK_FROM_HANDLE(radv_device, device, _device);
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
bool vm_fault_occurred = false;
/* Query if a GPUVM fault happened. */
vm_fault_occurred = radv_vm_fault_occurred(device, &fault_info);
pFaultCounts->vendorInfoCount = 0;
pFaultCounts->vendorBinarySize = 0;
if (device->gpu_hang_report) {
VkDeviceFaultVendorBinaryHeaderVersionOneEXT hdr;
hdr.headerSize = sizeof(VkDeviceFaultVendorBinaryHeaderVersionOneEXT);
hdr.headerVersion = VK_DEVICE_FAULT_VENDOR_BINARY_HEADER_VERSION_ONE_EXT;
hdr.vendorID = pdev->vk.properties.vendorID;
hdr.deviceID = pdev->vk.properties.deviceID;
hdr.driverVersion = pdev->vk.properties.driverVersion;
memcpy(hdr.pipelineCacheUUID, pdev->cache_uuid, VK_UUID_SIZE);
hdr.applicationNameOffset = 0;
hdr.applicationVersion = instance->vk.app_info.app_version;
hdr.engineNameOffset = 0;
hdr.engineVersion = instance->vk.app_info.engine_version;
hdr.apiVersion = instance->vk.app_info.api_version;
VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr = radv_get_device_fault_vendor_binary_header(device);
pFaultCounts->vendorBinarySize = sizeof(hdr) + strlen(device->gpu_hang_report);
if (pFaultInfo) {
@ -1921,24 +1950,65 @@ radv_GetDeviceFaultInfoEXT(VkDevice _device, VkDeviceFaultCountsEXT *pFaultCount
}
}
if (vm_fault_occurred) {
VkDeviceFaultAddressInfoEXT addr_fault_info = {
.reportedAddress = ((int64_t)fault_info.addr << 16) >> 16,
.addressPrecision = 4096, /* 4K page granularity */
};
VkDeviceFaultAddressInfoKHR addr_fault_info = radv_get_device_fault_addr_info(device, &vm_fault_occurred);
if (vm_fault_occurred) {
if (pFaultInfo)
strncpy(pFaultInfo->description, "A GPUVM fault has been detected", sizeof(pFaultInfo->description));
if (pdev->info.gfx_level >= GFX10) {
addr_fault_info.addressType = G_00A130_RW(fault_info.status) ? VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT
: VK_DEVICE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT;
} else {
/* Not sure how to get the access status on GFX6-9. */
addr_fault_info.addressType = VK_DEVICE_FAULT_ADDRESS_TYPE_NONE_EXT;
}
vk_outarray_append_typed(VkDeviceFaultAddressInfoEXT, &out, elem) *elem = addr_fault_info;
vk_outarray_append_typed(VkDeviceFaultAddressInfoKHR, &out, elem) *elem = addr_fault_info;
}
return vk_outarray_status(&out);
}
/* VK_KHR_device_fault */
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetDeviceFaultDebugInfoKHR(VkDevice _device, VkDeviceFaultDebugInfoKHR *pDebugInfo)
{
VK_FROM_HANDLE(radv_device, device, _device);
pDebugInfo->vendorBinarySize = 0;
if (device->gpu_hang_report) {
VkDeviceFaultVendorBinaryHeaderVersionOneKHR hdr = radv_get_device_fault_vendor_binary_header(device);
pDebugInfo->vendorBinarySize = sizeof(hdr) + strlen(device->gpu_hang_report);
if (pDebugInfo->pVendorBinaryData) {
memcpy(pDebugInfo->pVendorBinaryData, &hdr, sizeof(hdr));
memcpy((char *)pDebugInfo->pVendorBinaryData + sizeof(hdr), device->gpu_hang_report,
strlen(device->gpu_hang_report));
}
}
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetDeviceFaultReportsKHR(VkDevice _device, uint64_t timeout, uint32_t *pFaultCounts,
VkDeviceFaultInfoKHR *pFaultInfo)
{
VK_OUTARRAY_MAKE_TYPED(VkDeviceFaultInfoKHR, out, pFaultInfo, pFaultCounts);
VK_FROM_HANDLE(radv_device, device, _device);
VkDeviceFaultAddressInfoKHR addr_fault_info;
bool vm_fault_occurred = false;
bool timed_out = false;
uint64_t abs_timeout = os_time_get_absolute_timeout(timeout);
do {
addr_fault_info = radv_get_device_fault_addr_info(device, &vm_fault_occurred);
} while (timeout > 0 && !vm_fault_occurred && !(timed_out = (abs_timeout < os_time_get_nano())));
if (!vm_fault_occurred)
return VK_TIMEOUT;
VkDeviceFaultInfoKHR fault_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_FAULT_INFO_KHR,
.flags = VK_DEVICE_FAULT_FLAG_MEMORY_ADDRESS_KHR,
.faultAddressInfo = addr_fault_info,
};
strncpy(fault_info.description, "A GPUVM fault has been detected", sizeof(fault_info.description));
vk_outarray_append_typed(VkDeviceFaultInfoKHR, &out, elem) *elem = fault_info;
return vk_outarray_status(&out);
}