diff --git a/docs/drivers/amd/hang-debugging.rst b/docs/drivers/amd/hang-debugging.rst
index 54d5fd393f6..1edc09290af 100644
--- a/docs/drivers/amd/hang-debugging.rst
+++ b/docs/drivers/amd/hang-debugging.rst
@@ -31,6 +31,7 @@ there are a couple of files:
 
 * ``*.spv``: SPIR-V binaries of the pipeline that was bound when the hang
   occurred.
+* ``addr_binding_report.log``: ``VK_EXT_device_address_binding_report`` logs.
 * ``app_info.log``: ``VkApplicationInfo`` fields.
 * ``bo_history.log``: A list of every GPU memory allocation and deallocation.
   If the GPU hang was caused by a page fault, you can use
diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c
index b860ca0679f..693e033a63a 100644
--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@@ -15,6 +15,7 @@
 #endif
 #include
 
+#include "spirv/nir_spirv.h"
 #include "util/mesa-sha1.h"
 #include "util/os_time.h"
 #include "ac_debug.h"
@@ -27,7 +28,9 @@
 #include "radv_pipeline_rt.h"
 #include "radv_shader.h"
 #include "sid.h"
-#include "spirv/nir_spirv.h"
+
+#include "vk_common_entrypoints.h"
+#include "vk_enum_to_str.h"
 
 #define COLOR_RESET "\033[0m"
 #define COLOR_RED "\033[31m"
@@ -37,6 +40,144 @@
 
 #define RADV_DUMP_DIR "radv_dumps"
 
+/* Print one address binding report to f as a single log line. */
+static void
+radv_dump_address_binding_report(const struct radv_address_binding_report *report, FILE *f)
+{
+   /* The casts make the arguments match the %llu/%llx conversions. */
+   fprintf(f, "timestamp=%llu, VA=%.16llx-%.16llx, binding_type=%s, object_type=%s, object_handle=0x%llx\n",
+           (unsigned long long)report->timestamp, (unsigned long long)report->va,
+           (unsigned long long)(report->va + report->size),
+           (report->binding_type == VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT) ? "bind" : "unbind",
+           vk_ObjectType_to_str(report->object_type), (unsigned long long)report->object_handle);
+}
+
+/* Write every report recorded by the tracker of device to f. */
+static void
+radv_dump_address_binding_reports(struct radv_device *device, FILE *f)
+{
+   struct radv_address_binding_tracker *tracker = device->addr_binding_tracker;
+
+   /* Nothing to dump when no tracker was set up. */
+   if (!tracker)
+      return;
+
+   /* Hold the lock the callback takes while appending, so the array is not
+    * modified while it is walked.
+    */
+   simple_mtx_lock(&tracker->mtx);
+   util_dynarray_foreach (&tracker->reports, struct radv_address_binding_report, report)
+      radv_dump_address_binding_report(report, f);
+   simple_mtx_unlock(&tracker->mtx);
+}
+
+/* Debug utils messenger callback. When the message carries a
+ * VkDeviceAddressBindingCallbackDataEXT, append one report per object listed
+ * in callback_data to the tracker passed as userdata; otherwise do nothing.
+ * Always returns VK_FALSE.
+ */
+static VkBool32 VKAPI_PTR
+radv_address_binding_callback(VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
+                              VkDebugUtilsMessageTypeFlagsEXT message_types,
+                              const VkDebugUtilsMessengerCallbackDataEXT *callback_data, void *userdata)
+{
+   struct radv_address_binding_tracker *tracker = userdata;
+   const VkDeviceAddressBindingCallbackDataEXT *data;
+
+   if (!callback_data)
+      return VK_FALSE;
+
+   data = vk_find_struct_const(callback_data->pNext, DEVICE_ADDRESS_BINDING_CALLBACK_DATA_EXT);
+   if (!data)
+      return VK_FALSE;
+
+   simple_mtx_lock(&tracker->mtx);
+
+   for (uint32_t i = 0; i < callback_data->objectCount; i++) {
+      struct radv_address_binding_report report = {
+         .timestamp = os_time_get_nano(),
+         /* Only the low 48 bits of the reported address are kept. */
+         .va = data->baseAddress & ((1ull << 48) - 1),
+         .size = data->size,
+         .flags = data->flags,
+         .binding_type = data->bindingType,
+         .object_handle = callback_data->pObjects[i].objectHandle,
+         .object_type = callback_data->pObjects[i].objectType,
+      };
+
+      util_dynarray_append(&tracker->reports, struct radv_address_binding_report, report);
+   }
+
+   simple_mtx_unlock(&tracker->mtx);
+
+   return VK_FALSE;
+}
+
+/* Allocate the address binding tracker of device and register the debug utils
+ * messenger that feeds it. Returns false on failure, in which case
+ * device->addr_binding_tracker is NULL.
+ */
+static bool
+radv_init_address_binding_report(struct radv_device *device)
+{
+   struct radv_physical_device *pdev = radv_device_physical(device);
+   struct radv_instance *instance = radv_physical_device_instance(pdev);
+   struct radv_address_binding_tracker *tracker;
+   VkResult result;
+
+   tracker = calloc(1, sizeof(*tracker));
+   device->addr_binding_tracker = tracker;
+   if (!tracker)
+      return false;
+
+   simple_mtx_init(&tracker->mtx, mtx_plain);
+   util_dynarray_init(&tracker->reports, NULL);
+
+   VkDebugUtilsMessengerCreateInfoEXT create_info = {
+      .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT,
+      .pUserData = tracker,
+      .pfnUserCallback = radv_address_binding_callback,
+      .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_DEVICE_ADDRESS_BINDING_BIT_EXT,
+   };
+
+   result = vk_common_CreateDebugUtilsMessengerEXT(radv_instance_to_handle(instance), &create_info, NULL,
+                                                   &tracker->messenger);
+   if (result != VK_SUCCESS) {
+      /* Release the tracker here so no half-initialized state is left on the
+       * device.
+       */
+      util_dynarray_fini(&tracker->reports);
+      simple_mtx_destroy(&tracker->mtx);
+      free(tracker);
+      device->addr_binding_tracker = NULL;
+      return false;
+   }
+
+   return true;
+}
+
+/* Unregister the messenger and free the tracker set up by
+ * radv_init_address_binding_report().
+ */
+static void
+radv_finish_address_binding_report(struct radv_device *device)
+{
+   struct radv_physical_device *pdev = radv_device_physical(device);
+   struct radv_instance *instance = radv_physical_device_instance(pdev);
+   struct radv_address_binding_tracker *tracker = device->addr_binding_tracker;
+
+   /* Destroy the messenger first so the callback is unregistered before the
+    * mutex and array it uses are torn down.
+    */
+   vk_common_DestroyDebugUtilsMessengerEXT(radv_instance_to_handle(instance), tracker->messenger, NULL);
+
+   util_dynarray_fini(&tracker->reports);
+   simple_mtx_destroy(&tracker->mtx);
+
+   free(tracker);
+   device->addr_binding_tracker = NULL;
+}
+
 bool
 radv_init_trace(struct radv_device *device)
 {
@@ -58,6 +199,9 @@ radv_init_trace(struct radv_device *device)
    if (!device->trace_data)
       return false;
 
+   if (!radv_init_address_binding_report(device))
+      return false;
+
    return true;
 }
 
@@ -66,6 +210,9 @@ radv_finish_trace(struct radv_device *device)
 {
    struct radeon_winsys *ws = device->ws;
 
+   if (device->addr_binding_tracker)
+      radv_finish_address_binding_report(device);
+
    if (unlikely(device->trace_bo)) {
       ws->buffer_make_resident(ws, device->trace_bo, false);
       radv_bo_destroy(device, NULL, device->trace_bo);
@@ -728,6 +875,7 @@ enum radv_device_fault_chunk {
    RADV_DEVICE_FAULT_CHUNK_REGISTERS,
    RADV_DEVICE_FAULT_CHUNK_BO_RANGES,
    RADV_DEVICE_FAULT_CHUNK_BO_HISTORY,
+   RADV_DEVICE_FAULT_CHUNK_ADDR_BINDING_REPORT,
    RADV_DEVICE_FAULT_CHUNK_VM_FAULT,
    RADV_DEVICE_FAULT_CHUNK_APP_INFO,
    RADV_DEVICE_FAULT_CHUNK_GPU_INFO,
@@ -801,8 +949,9 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
       char *ptr;
       size_t size;
    } chunks[RADV_DEVICE_FAULT_CHUNK_COUNT] = {
-      {"trace"}, {"pipeline"}, {"umr_waves"}, {"umr_ring"}, {"registers"}, {"bo_ranges"},
-      {"bo_history"}, {"vm_fault"}, {"app_info"}, {"gpu_info"}, {"dmesg"},
+      {"trace"}, {"pipeline"}, {"umr_waves"}, {"umr_ring"},
+      {"registers"}, {"bo_ranges"}, {"bo_history"}, {"addr_binding_report"},
+      {"vm_fault"}, {"app_info"}, {"gpu_info"}, {"dmesg"},
    };
 
    char *wave_dump = NULL;
@@ -846,6 +995,9 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
       case RADV_DEVICE_FAULT_CHUNK_BO_HISTORY:
          device->ws->dump_bo_log(device->ws, f);
          break;
+      case RADV_DEVICE_FAULT_CHUNK_ADDR_BINDING_REPORT:
+         radv_dump_address_binding_reports(device, f);
+         break;
       case RADV_DEVICE_FAULT_CHUNK_VM_FAULT:
          if (vm_fault_occurred)
             radv_dump_vm_fault(device, &fault_info, f);
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 52f5e82f68e..6647fa096e2 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -137,4 +137,22 @@ struct radv_trace_data {
    VkDispatchIndirectCommand indirect_dispatch;
 };
 
+/* One address binding event recorded by the debug utils messenger callback. */
+struct radv_address_binding_report {
+   uint64_t timestamp; /* CPU timestamp */
+   uint64_t va;
+   uint64_t size;
+   VkDeviceAddressBindingFlagsEXT flags;
+   VkDeviceAddressBindingTypeEXT binding_type;
+   uint64_t object_handle;
+   VkObjectType object_type;
+};
+
+/* Per-device list of address binding reports and the messenger feeding it. */
+struct radv_address_binding_tracker {
+   VkDebugUtilsMessengerEXT messenger;
+   struct util_dynarray reports; /* array of struct radv_address_binding_report */
+   simple_mtx_t mtx;             /* taken around every access to reports */
+};
+
 #endif /* RADV_DEBUG_H */
diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h
index dbd7f961c38..6ae588212c0 100644
--- a/src/amd/vulkan/radv_device.h
+++ b/src/amd/vulkan/radv_device.h
@@ -540,6 +540,9 @@ struct radv_device {
    /* PSO cache stats */
    simple_mtx_t pso_cache_stats_mtx;
    struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
+
+   /* Address binding report tracker, allocated by radv_init_trace(). */
+   struct radv_address_binding_tracker *addr_binding_tracker;
 };
 
 VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)