radv: dump address binding report with RADV_DEBUG=hang

This contains much more info than the BO history from the winsys and
it will be helpful for debugging.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32403>
This commit is contained in:
Samuel Pitoiset 2024-11-29 08:46:59 +01:00 committed by Marge Bot
parent 1ae6fcfbaf
commit 1b68a92c59
4 changed files with 135 additions and 3 deletions

View file

@ -31,6 +31,7 @@ there are a couple of files:
* ``*.spv``: SPIR-V binaries of the pipeline that was bound when the hang * ``*.spv``: SPIR-V binaries of the pipeline that was bound when the hang
occurred. occurred.
* ``addr_binding_report.log``: VK_EXT_address_binding_report logs.
* ``app_info.log``: ``VkApplicationInfo`` fields. * ``app_info.log``: ``VkApplicationInfo`` fields.
* ``bo_history.log``: A list of every GPU memory allocation and deallocation. * ``bo_history.log``: A list of every GPU memory allocation and deallocation.
If the GPU hang was caused by a page fault, you can use If the GPU hang was caused by a page fault, you can use

View file

@ -15,6 +15,7 @@
#endif #endif
#include <sys/stat.h> #include <sys/stat.h>
#include "spirv/nir_spirv.h"
#include "util/mesa-sha1.h" #include "util/mesa-sha1.h"
#include "util/os_time.h" #include "util/os_time.h"
#include "ac_debug.h" #include "ac_debug.h"
@ -27,7 +28,9 @@
#include "radv_pipeline_rt.h" #include "radv_pipeline_rt.h"
#include "radv_shader.h" #include "radv_shader.h"
#include "sid.h" #include "sid.h"
#include "spirv/nir_spirv.h"
#include "vk_common_entrypoints.h"
#include "vk_enum_to_str.h"
#define COLOR_RESET "\033[0m" #define COLOR_RESET "\033[0m"
#define COLOR_RED "\033[31m" #define COLOR_RED "\033[31m"
@ -37,6 +40,105 @@
#define RADV_DUMP_DIR "radv_dumps" #define RADV_DUMP_DIR "radv_dumps"
/* Write a single address binding report to f as one text line.
 *
 * The line holds the CPU timestamp, the VA range from va to va + size,
 * the binding type ("bind" for VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT,
 * "unbind" for anything else), and the type and handle of the object.
 */
static void
radv_dump_address_binding_report(const struct radv_address_binding_report *report, FILE *f)
{
   /* %llu and %llx consume an unsigned long long, so the uint64_t fields are
    * cast to exactly that type rather than to the signed long long.
    */
   fprintf(f, "timestamp=%llu, VA=%.16llx-%.16llx, binding_type=%s, object_type=%s, object_handle=0x%llx\n",
           (unsigned long long)report->timestamp, (unsigned long long)report->va,
           (unsigned long long)(report->va + report->size),
           (report->binding_type == VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT) ? "bind" : "unbind",
           vk_ObjectType_to_str(report->object_type), (unsigned long long)report->object_handle);
}
static void
radv_dump_address_binding_reports(struct radv_device *device, FILE *f)
{
struct radv_address_binding_tracker *tracker = device->addr_binding_tracker;
simple_mtx_lock(&tracker->mtx);
util_dynarray_foreach (&tracker->reports, struct radv_address_binding_report, report)
radv_dump_address_binding_report(report, f);
simple_mtx_unlock(&tracker->mtx);
}
/* Debug utils messenger callback that records address binding events.
 *
 * userdata is the radv_address_binding_tracker the messenger was created
 * with. Messages without callback data, or without a
 * VkDeviceAddressBindingCallbackDataEXT in the pNext chain, are ignored.
 * Otherwise one report is appended per object listed in the callback data,
 * all under the tracker mutex.
 *
 * Always returns VK_FALSE.
 */
static VkBool32 VKAPI_PTR
radv_address_binding_callback(VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
                              VkDebugUtilsMessageTypeFlagsEXT message_types,
                              const VkDebugUtilsMessengerCallbackDataEXT *callback_data, void *userdata)
{
   struct radv_address_binding_tracker *binding_tracker = userdata;

   if (callback_data == NULL)
      return VK_FALSE;

   const VkDeviceAddressBindingCallbackDataEXT *binding =
      vk_find_struct_const(callback_data->pNext, DEVICE_ADDRESS_BINDING_CALLBACK_DATA_EXT);
   if (binding == NULL)
      return VK_FALSE;

   /* Only the low 48 bits of the reported base address are kept. */
   const uint64_t va_mask = (1ull << 48) - 1;

   simple_mtx_lock(&binding_tracker->mtx);

   for (uint32_t idx = 0; idx < callback_data->objectCount; idx++) {
      struct radv_address_binding_report entry = {
         .timestamp = os_time_get_nano(),
         .va = binding->baseAddress & va_mask,
         .size = binding->size,
         .flags = binding->flags,
         .binding_type = binding->bindingType,
         .object_handle = callback_data->pObjects[idx].objectHandle,
         .object_type = callback_data->pObjects[idx].objectType,
      };

      util_dynarray_append(&binding_tracker->reports, struct radv_address_binding_report, entry);
   }

   simple_mtx_unlock(&binding_tracker->mtx);

   return VK_FALSE;
}
static bool
radv_init_adress_binding_report(struct radv_device *device)
{
struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_instance *instance = radv_physical_device_instance(pdev);
VkResult result;
device->addr_binding_tracker = calloc(1, sizeof(*device->addr_binding_tracker));
if (!device->addr_binding_tracker)
return false;
simple_mtx_init(&device->addr_binding_tracker->mtx, mtx_plain);
util_dynarray_init(&device->addr_binding_tracker->reports, NULL);
VkDebugUtilsMessengerCreateInfoEXT create_info = {
.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT,
.pUserData = device->addr_binding_tracker,
.pfnUserCallback = radv_address_binding_callback,
.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_DEVICE_ADDRESS_BINDING_BIT_EXT,
};
result = vk_common_CreateDebugUtilsMessengerEXT(radv_instance_to_handle(instance), &create_info, NULL,
&device->addr_binding_tracker->messenger);
if (result != VK_SUCCESS)
return false;
return true;
}
/* Tear down the device's address binding tracker.
 *
 * The caller must ensure device->addr_binding_tracker is non-NULL. On return
 * the messenger is destroyed, the tracker is freed and the device pointer is
 * reset to NULL.
 */
static void
radv_finish_address_binding_report(struct radv_device *device)
{
   struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_instance *instance = radv_physical_device_instance(pdev);
   struct radv_address_binding_tracker *tracker = device->addr_binding_tracker;

   /* Unregister the messenger first: its callback receives this tracker as
    * user data and takes tracker->mtx, so it must not be able to run once
    * the mutex and the report array start being destroyed.
    */
   vk_common_DestroyDebugUtilsMessengerEXT(radv_instance_to_handle(instance), tracker->messenger, NULL);

   util_dynarray_fini(&tracker->reports);
   simple_mtx_destroy(&tracker->mtx);

   free(tracker);
   /* Do not leave a dangling pointer behind on the device. */
   device->addr_binding_tracker = NULL;
}
bool bool
radv_init_trace(struct radv_device *device) radv_init_trace(struct radv_device *device)
{ {
@ -58,6 +160,9 @@ radv_init_trace(struct radv_device *device)
if (!device->trace_data) if (!device->trace_data)
return false; return false;
if (!radv_init_adress_binding_report(device))
return false;
return true; return true;
} }
@ -66,6 +171,9 @@ radv_finish_trace(struct radv_device *device)
{ {
struct radeon_winsys *ws = device->ws; struct radeon_winsys *ws = device->ws;
if (device->addr_binding_tracker)
radv_finish_address_binding_report(device);
if (unlikely(device->trace_bo)) { if (unlikely(device->trace_bo)) {
ws->buffer_make_resident(ws, device->trace_bo, false); ws->buffer_make_resident(ws, device->trace_bo, false);
radv_bo_destroy(device, NULL, device->trace_bo); radv_bo_destroy(device, NULL, device->trace_bo);
@ -728,6 +836,7 @@ enum radv_device_fault_chunk {
RADV_DEVICE_FAULT_CHUNK_REGISTERS, RADV_DEVICE_FAULT_CHUNK_REGISTERS,
RADV_DEVICE_FAULT_CHUNK_BO_RANGES, RADV_DEVICE_FAULT_CHUNK_BO_RANGES,
RADV_DEVICE_FAULT_CHUNK_BO_HISTORY, RADV_DEVICE_FAULT_CHUNK_BO_HISTORY,
RADV_DEVICE_FAULT_CHUNK_ADDR_BINDING_REPORT,
RADV_DEVICE_FAULT_CHUNK_VM_FAULT, RADV_DEVICE_FAULT_CHUNK_VM_FAULT,
RADV_DEVICE_FAULT_CHUNK_APP_INFO, RADV_DEVICE_FAULT_CHUNK_APP_INFO,
RADV_DEVICE_FAULT_CHUNK_GPU_INFO, RADV_DEVICE_FAULT_CHUNK_GPU_INFO,
@ -801,8 +910,9 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
char *ptr; char *ptr;
size_t size; size_t size;
} chunks[RADV_DEVICE_FAULT_CHUNK_COUNT] = { } chunks[RADV_DEVICE_FAULT_CHUNK_COUNT] = {
{"trace"}, {"pipeline"}, {"umr_waves"}, {"umr_ring"}, {"registers"}, {"bo_ranges"}, {"trace"}, {"pipeline"}, {"umr_waves"}, {"umr_ring"},
{"bo_history"}, {"vm_fault"}, {"app_info"}, {"gpu_info"}, {"dmesg"}, {"registers"}, {"bo_ranges"}, {"bo_history"}, {"addr_binding_report"},
{"vm_fault"}, {"app_info"}, {"gpu_info"}, {"dmesg"},
}; };
char *wave_dump = NULL; char *wave_dump = NULL;
@ -846,6 +956,9 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
case RADV_DEVICE_FAULT_CHUNK_BO_HISTORY: case RADV_DEVICE_FAULT_CHUNK_BO_HISTORY:
device->ws->dump_bo_log(device->ws, f); device->ws->dump_bo_log(device->ws, f);
break; break;
case RADV_DEVICE_FAULT_CHUNK_ADDR_BINDING_REPORT:
radv_dump_address_binding_reports(device, f);
break;
case RADV_DEVICE_FAULT_CHUNK_VM_FAULT: case RADV_DEVICE_FAULT_CHUNK_VM_FAULT:
if (vm_fault_occurred) if (vm_fault_occurred)
radv_dump_vm_fault(device, &fault_info, f); radv_dump_vm_fault(device, &fault_info, f);

View file

@ -137,4 +137,20 @@ struct radv_trace_data {
VkDispatchIndirectCommand indirect_dispatch; VkDispatchIndirectCommand indirect_dispatch;
}; };
/* One recorded address binding event, as captured by the address binding
 * debug utils callback and printed into the hang report.
 */
struct radv_address_binding_report {
uint64_t timestamp; /* CPU timestamp */
/* Base address of the range, masked to its low 48 bits by the callback. */
uint64_t va;
/* Size of the range, as reported in the callback data. */
uint64_t size;
/* Flags copied from the callback data. */
VkDeviceAddressBindingFlagsEXT flags;
/* Whether the event is a bind or an unbind. */
VkDeviceAddressBindingTypeEXT binding_type;
/* Handle and type of the object named in the callback data. */
uint64_t object_handle;
VkObjectType object_type;
};
/* Per-device state used to collect address binding reports. */
struct radv_address_binding_tracker {
/* Debug utils messenger whose callback appends to reports. */
VkDebugUtilsMessengerEXT messenger;
/* Array of struct radv_address_binding_report entries. */
struct util_dynarray reports;
/* Held while appending to or dumping reports. */
simple_mtx_t mtx;
};
#endif /* RADV_DEBUG_H */ #endif /* RADV_DEBUG_H */

View file

@ -540,6 +540,8 @@ struct radv_device {
/* PSO cache stats */ /* PSO cache stats */
simple_mtx_t pso_cache_stats_mtx; simple_mtx_t pso_cache_stats_mtx;
struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT]; struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
struct radv_address_binding_tracker *addr_binding_tracker;
}; };
VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)