anv/bvh: Dump BVH synchronously upon command buffer completion

Modified the BVH dumping mechanism to synchronously wait for the command
buffer to complete before saving BVH data to files. This approach is
more robust compared to the previous method of dumping during
acceleration structure destruction.

Note: if DEBUG_BVH_ANY is enabled but intel-rt is disabled, we will wait
for nothing.

Signed-off-by: Kevin Chuang <kaiwenjon23@gmail.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32585>
This commit is contained in:
Kevin Chuang 2024-12-11 02:21:06 -08:00 committed by Marge Bot
parent c695043e81
commit 1b55f10105
6 changed files with 156 additions and 190 deletions

View file

@ -479,6 +479,7 @@ VkResult anv_CreateDevice(
list_inithead(&device->memory_objects);
list_inithead(&device->image_private_objects);
list_inithead(&device->bvh_dumps);
if (pthread_mutex_init(&device->mutex, NULL) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);

View file

@ -1843,12 +1843,15 @@ enum bvh_dump_type {
BVH_IR_AS
};
struct bvh_dump_struct {
struct anv_bvh_dump {
struct anv_bo *bo;
uint32_t bvh_id;
uint64_t dump_size;
VkGeometryTypeKHR geometry_type;
enum bvh_dump_type dump_type;
/* Link in the anv_device.bvh_dumps list */
struct list_head link;
};
struct anv_device_astc_emu {
@ -1887,6 +1890,9 @@ struct anv_device {
/** List of anv_image objects with a private binding for implicit CCS */
struct list_head image_private_objects;
/** List of anv_bvh_dump objects that get dumped on cmd buf completion */
struct list_head bvh_dumps;
/** Memory pool for batch buffers */
struct anv_bo_pool batch_bo_pool;
/** Memory pool for utrace timestamp buffers */
@ -2290,6 +2296,8 @@ VkResult anv_device_print_init(struct anv_device *device);
void anv_device_print_fini(struct anv_device *device);
void anv_device_print_shader_prints(struct anv_device *device);
void anv_dump_bvh_to_files(struct anv_device *device);
VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue,
const VkDeviceQueueCreateInfo *pCreateInfo,
uint32_t index_in_family);
@ -2318,6 +2326,12 @@ anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result)
if (INTEL_DEBUG(DEBUG_SHADER_PRINT))
anv_device_print_shader_prints(queue->device);
#if ANV_SUPPORT_RT && !ANV_SUPPORT_RT_GRL
/* The recorded bvh is dumped to files upon command buffer completion */
if (INTEL_DEBUG(DEBUG_BVH_ANY))
anv_dump_bvh_to_files(queue->device);
#endif
return result;
}

View file

@ -92,9 +92,9 @@ anv_queue_init(struct anv_device *device, struct anv_queue *queue,
}
/* Add a debug fence to wait on submissions if we're using the synchronized
* submission feature or the shader-print feature.
* submission feature, shader-print feature, or BVH dump.
*/
if (INTEL_DEBUG(DEBUG_SYNC | DEBUG_SHADER_PRINT)) {
if (INTEL_DEBUG(DEBUG_SYNC | DEBUG_SHADER_PRINT | DEBUG_BVH_ANY)) {
result = vk_sync_create(&device->vk,
&device->physical->sync_syncobj_type,
0, 0, &queue->sync);

View file

@ -27,6 +27,7 @@
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <sys/stat.h>
#include "anv_private.h"
#include "vk_enum_to_str.h"
@ -225,3 +226,89 @@ anv_device_print_shader_prints(struct anv_device *device)
simple_mtx_unlock(&device->printf.mutex);
}
/* Create the dump directory "dir" and its sub directory "dir/sub_dir",
 * tolerating directories that already exist from a previous run.
 * Errors are reported on stderr and the function returns without creating
 * anything further.
 */
static void
create_directory(const char *dir, const char *sub_dir)
{
   char full_path[PATH_MAX];
   int len = snprintf(full_path, sizeof(full_path), "%s/%s", dir, sub_dir);

   /* snprintf truncates silently; a truncated path would create a wrong
    * directory, so detect it and bail out loudly instead.
    */
   if (len < 0 || (size_t)len >= sizeof(full_path)) {
      fprintf(stderr, "Error: dump directory path too long\n");
      return;
   }

   /* EEXIST is expected when dumping repeatedly: the directories persist. */
   if (mkdir(dir, 0777) == -1 && errno != EEXIST) {
      perror("Error creating directory");
      return;
   }

   if (mkdir(full_path, 0777) == -1 && errno != EEXIST) {
      perror("Error creating sub directory");
      return;
   }
}
/* Serialize one recorded bvh dump to a text file under
 * bvh_dump/{BVH_ANV,BVH_IR_HDR,BVH_IR_AS}/, named tlas_{id}.txt or
 * blas_{id}.txt depending on the geometry type. The file contains the
 * mapped BO contents as a byte-by-byte hex dump, 16 bytes per row.
 */
static void
create_bvh_dump_file(struct anv_bvh_dump *bvh)
{
   if (bvh == NULL) {
      fprintf(stderr, "Error: BVH DUMP structure is NULL\n");
      return;
   }

   /* Pick the sub directory that matches the dump flavor. */
   const char *root = "bvh_dump";
   const char *sub;
   switch (bvh->dump_type) {
   case BVH_ANV:    sub = "BVH_ANV";    break;
   case BVH_IR_HDR: sub = "BVH_IR_HDR"; break;
   case BVH_IR_AS:  sub = "BVH_IR_AS";  break;
   default:         unreachable("invalid dump type");
   }

   create_directory(root, sub);

   bool is_tlas = bvh->geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR;
   char path[256];
   snprintf(path, sizeof(path),
            is_tlas ? "%s/%s/tlas_%d.txt" : "%s/%s/blas_%d.txt",
            root, sub, bvh->bvh_id);

   FILE *out = fopen(path, "w");
   if (out == NULL) {
      perror("Error creating file");
      return;
   }

   fprintf(stderr, "BVH Dump File created: %s\n", path);

   /* Dump every bytes like this: B0 B1 B2 B3 ... B15 */
   volatile uint8_t *bytes = (volatile uint8_t *)bvh->bo->map;
   for (uint64_t i = 0; i < bvh->dump_size; i++) {
      fprintf(out, "%02" PRIx8 " ", bytes[i]);
      if ((i + 1) % 16 == 0) {
         fprintf(out, "\n");
      }
   }

   fclose(out);
}
/* Flush every bvh dump recorded on the device to disk, then release the
 * backing BO and free the list entry. Called from anv_queue_post_submit
 * after the submission has completed, so the BO contents are stable.
 */
void anv_dump_bvh_to_files(struct anv_device *device)
{
/* device->mutex is acquired in anv_queue_submit, so no need to lock here. */
/* The _safe iteration variant is required: each entry is unlinked and
 * freed inside the loop body.
 */
list_for_each_entry_safe(struct anv_bvh_dump, bvh_dump, &device->bvh_dumps,
link) {
create_bvh_dump_file(bvh_dump);
anv_device_release_bo(device, bvh_dump->bo);
list_del(&bvh_dump->link);
free(bvh_dump);
}
}

View file

@ -3,11 +3,11 @@
1. `INTEL_DEBUG=bvh_tlas,bvh_blas` will generate `tlas_{id}.txt` or `blas_{id}.txt` in `bvh_dump/BVH_ANV` directory.
2. `INTEL_DEBUG=bvh_tlas_ir_hdr,bvh_blas_ir_hdr` will generate `tlas_{id}.txt` or `blas_{id}.txt` in `bvh_dump/BVH_IR_HDR` directory.
3. `INTEL_DEBUG=bvh_tlas_ir_as,bvh_blas_ir_as` will generate `tlas_{id}.txt` or `blas_{id}.txt` in `bvh_dump/BVH_IR_AS` directory.
4. `INTEL_DEBUG=bvh_no_build` will skip the intel-specific-encoding part. If a gpu hang is seen, this is the first step to isolate the problem. If toggled on and the gpu no longer hangs, that means either encode.comp was spinning, or the built bvh had issues and the gpu hung during bvh traversal.
4. `INTEL_DEBUG=bvh_no_build` will skip the intel-specific-encoding part. This will make the bvh NULL and the HW traversal will see this as a miss.
The dumped text file contains memory dump, byte-by-byte in hex. The contents are contiguous memory of a certain region.
1. The dump in `BVH_ANV` starts from the beginning of `anv_accel_struct_header` to the end of the bvh. Nodes/leaves are packed tightly after the header, encoded in a way that our HW expects.
2. The dump in `BVH_IR_HDR` records the contents of `vk_ir_header` sitting at the beginning of ir bvh.
1. The dump in `BVH_ANV` starts from the beginning of `anv_accel_struct_header` to the end of the bvh.
2. The dump in `BVH_IR_HDR` records `vk_ir_header`, which sits at the beginning of ir bvh.
3. The dump in `BVH_IR_AS` records all `vk_ir_{leaf_type}_node` and `vk_ir_box_node` in ir bvh. The region starts from where leaves are encoded to the end of ir bvh.
### Decode the dump
@ -34,5 +34,4 @@ python3 -m http.server 8000
### Note and Limitations:
1. The python script uses `ctypes` to interpret the memory dump, so the structures defined in the script should match the structures defined in the driver.
2. The memory dump is a snapshot of a VkBuffer captured at the end of `CmdBuildAccelerationStructure` call. It won't capture any bvh obtained from `CmdCopy`.
3. The memory dump of captured bvhs so far are saved to files at the moment when `DestroyAccelerationStructure` is called every time.
4. If ANY dump is enabled, we will nullify anv tlas bvh and send all 0s to the gpu. Doing this can prevent gpu hang caused by incorrect bvh traversal. However, the actual contents are still saved to files for debugging.
3. The CPU will wait for the command buffer to complete, and save the recorded bvh into files on disk.

View file

@ -4,7 +4,6 @@
#include "anv_private.h"
#include <sys/stat.h>
#include <math.h>
#include "util/u_debug.h"
@ -27,18 +26,9 @@
#if GFX_VERx10 >= 125
/* TODO: Dumping things on destroy doesn't look robust. Would be nice to track
* the debug operation when the command buffer is executed and synchronously
* wait on the command buffer and write the data to disk upon completion.
*
* Each time a CmdBuildAS is completed, we append one element to bvhDumpArray.
* When DestroyAccelerationStructure is called every time, we dump the
* accumulated elements so far to files.
*/
/* Id to track bvh_dump */
static uint32_t blas_id = 0;
static uint32_t tlas_id = 0;
static struct bvh_dump_struct *bvhDumpArray = NULL;
static uint32_t bvh_dump_array_size = 0;
static void
begin_debug_marker(VkCommandBuffer commandBuffer,
@ -111,56 +101,19 @@ end_debug_marker(VkCommandBuffer commandBuffer)
}
}
/* clear out everything from (header + bvh_offset) to the end */
static void
clear_out_anv_bvh(struct anv_cmd_buffer *cmd_buffer,
VkDeviceAddress header_addr, struct bvh_layout bvh_layout)
{
uint64_t offset = bvh_layout.bvh_offset;
uint64_t clear_size = bvh_layout.size - bvh_layout.bvh_offset;
assert(clear_size % 4 == 0);
struct anv_address anv_bvh_addr = anv_address_from_u64(header_addr + offset);
anv_cmd_buffer_fill_area(cmd_buffer, anv_bvh_addr, clear_size, 0, false);
genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
}
static void
expand_bvh_dump_array()
{
/* Reallocate bvh dump array */
bvhDumpArray =
(struct bvh_dump_struct *)realloc(bvhDumpArray,
(bvh_dump_array_size + 1) *
sizeof(struct bvh_dump_struct));
if (bvhDumpArray == NULL) {
perror("Failed to reallocate memory for bvh dump array.");
}
bvh_dump_array_size++;
}
static void
append_bvh_dump(struct anv_cmd_buffer *cmd_buffer, VkDeviceAddress src,
uint64_t dump_size, VkGeometryTypeKHR geometry_type,
enum bvh_dump_type dump_type)
add_bvh_dump(struct anv_cmd_buffer *cmd_buffer,
VkDeviceAddress src,
uint64_t dump_size,
VkGeometryTypeKHR geometry_type,
enum bvh_dump_type dump_type)
{
assert(dump_size % 4 == 0);
expand_bvh_dump_array();
struct anv_device *device = cmd_buffer->device;
struct bvh_dump_struct *latestElement =
bvhDumpArray + bvh_dump_array_size - 1;
struct anv_bo *bo = NULL;
VkResult result = anv_device_alloc_bo(device, "dump_bvh", dump_size,
VkResult result = anv_device_alloc_bo(device, "bvh_dump", dump_size,
ANV_BO_ALLOC_MAPPED |
ANV_BO_ALLOC_HOST_CACHED_COHERENT, 0,
&bo);
@ -170,50 +123,55 @@ append_bvh_dump(struct anv_cmd_buffer *cmd_buffer, VkDeviceAddress src,
return;
}
latestElement->bo = bo;
latestElement->bvh_id = geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ?
struct anv_bvh_dump *bvh_dump = malloc(sizeof(struct anv_bvh_dump));
bvh_dump->bo = bo;
bvh_dump->bvh_id = geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ?
tlas_id : blas_id;
latestElement->dump_size = dump_size;
latestElement->geometry_type = geometry_type;
latestElement->dump_type = dump_type;
bvh_dump->dump_size = dump_size;
bvh_dump->geometry_type = geometry_type;
bvh_dump->dump_type = dump_type;
struct anv_address dst_addr = { .bo = latestElement->bo, .offset = 0 };
struct anv_address dst_addr = { .bo = bvh_dump->bo, .offset = 0 };
struct anv_address src_addr = anv_address_from_u64(src);
anv_cmd_copy_addr(cmd_buffer, src_addr, dst_addr, latestElement->dump_size);
anv_cmd_copy_addr(cmd_buffer, src_addr, dst_addr, bvh_dump->dump_size);
genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_CS_STALL_BIT);
pthread_mutex_lock(&device->mutex);
list_addtail(&bvh_dump->link, &device->bvh_dumps);
pthread_mutex_unlock(&device->mutex);
}
static void
debug_dump_bvh(struct anv_cmd_buffer *cmd_buffer, VkDeviceAddress header_addr,
uint64_t bvh_anv_size, VkDeviceAddress intermediate_header_addr,
VkDeviceAddress intermediate_as_addr, uint32_t leaf_count,
VkGeometryTypeKHR geometry_type)
debug_record_as_to_bvh_dump(struct anv_cmd_buffer *cmd_buffer,
VkDeviceAddress header_addr,
uint64_t bvh_anv_size,
VkDeviceAddress intermediate_header_addr,
VkDeviceAddress intermediate_as_addr,
uint32_t leaf_count,
VkGeometryTypeKHR geometry_type)
{
if (INTEL_DEBUG(DEBUG_BVH_BLAS) &&
geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) {
append_bvh_dump(cmd_buffer, header_addr, bvh_anv_size, geometry_type,
BVH_ANV);
add_bvh_dump(cmd_buffer, header_addr, bvh_anv_size, geometry_type,
BVH_ANV);
}
if (INTEL_DEBUG(DEBUG_BVH_TLAS) &&
geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) {
append_bvh_dump(cmd_buffer, header_addr, bvh_anv_size, geometry_type,
BVH_ANV);
add_bvh_dump(cmd_buffer, header_addr, bvh_anv_size, geometry_type,
BVH_ANV);
}
if (INTEL_DEBUG(DEBUG_BVH_BLAS_IR_HDR) &&
geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) {
append_bvh_dump(cmd_buffer, intermediate_header_addr,
sizeof(struct vk_ir_header), geometry_type, BVH_IR_HDR);
add_bvh_dump(cmd_buffer, intermediate_header_addr,
sizeof(struct vk_ir_header), geometry_type, BVH_IR_HDR);
}
if (INTEL_DEBUG(DEBUG_BVH_TLAS_IR_HDR) &&
geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) {
append_bvh_dump(cmd_buffer, intermediate_header_addr,
sizeof(struct vk_ir_header), geometry_type, BVH_IR_HDR);
add_bvh_dump(cmd_buffer, intermediate_header_addr,
sizeof(struct vk_ir_header), geometry_type, BVH_IR_HDR);
}
uint32_t internal_node_count = MAX2(leaf_count, 2) - 1;
@ -235,18 +193,16 @@ debug_dump_bvh(struct anv_cmd_buffer *cmd_buffer, VkDeviceAddress header_addr,
unreachable("invalid geometry type");
}
append_bvh_dump(cmd_buffer, intermediate_as_addr,
internal_node_total_size + leaf_total_size,
geometry_type, BVH_IR_AS);
add_bvh_dump(cmd_buffer, intermediate_as_addr, internal_node_total_size +
leaf_total_size, geometry_type, BVH_IR_AS);
}
if (INTEL_DEBUG(DEBUG_BVH_TLAS_IR_AS) &&
geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR) {
uint64_t leaf_total_size = sizeof(struct vk_ir_instance_node) *
leaf_count;
append_bvh_dump(cmd_buffer, intermediate_as_addr,
internal_node_total_size + leaf_total_size,
geometry_type, BVH_IR_AS);
add_bvh_dump(cmd_buffer, intermediate_as_addr, internal_node_total_size +
leaf_total_size, geometry_type, BVH_IR_AS);
}
@ -590,16 +546,15 @@ anv_init_header(VkCommandBuffer commandBuffer,
}
if (INTEL_DEBUG(DEBUG_BVH_ANY)) {
debug_dump_bvh(cmd_buffer, header_addr, bvh_layout.size,
intermediate_header_addr, intermediate_as_addr,
leaf_count, geometry_type);
/* Nullify tlas and send zeros to gpu, so that tlas traversal will return
* early. Doing this can prevent the gpu hang caused by incorrect bvh
* traversal.
*/
if (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR)
clear_out_anv_bvh(cmd_buffer, header_addr, bvh_layout);
genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
debug_record_as_to_bvh_dump(cmd_buffer, header_addr, bvh_layout.size,
intermediate_header_addr, intermediate_as_addr,
leaf_count, geometry_type);
}
}
@ -1002,102 +957,12 @@ genX(WriteAccelerationStructuresPropertiesKHR)(
return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}
/* Create the dump directory "dir" and its sub directory "dir/sub_dir",
 * tolerating directories that already exist (EEXIST) from a previous run.
 * NOTE(review): the snprintf result is not checked — a path longer than
 * PATH_MAX would be truncated silently; confirm inputs are short constants.
 */
static void
create_directory(const char *dir, const char *sub_dir)
{
char full_path[PATH_MAX];
snprintf(full_path, sizeof(full_path), "%s/%s", dir, sub_dir);
if (mkdir(dir, 0777) == -1 && errno != EEXIST) {
perror("Error creating directory");
return;
}
if (mkdir(full_path, 0777) == -1 && errno != EEXIST) {
perror("Error creating sub directory");
return;
}
}
/* Serialize one recorded bvh dump to a text file under
 * bvh_dump/{BVH_ANV,BVH_IR_HDR,BVH_IR_AS}/, named tlas_{id}.txt or
 * blas_{id}.txt depending on the geometry type. The file is a hex dump of
 * the mapped BO, byte by byte, 16 bytes per row.
 */
static void
create_dump_file(struct bvh_dump_struct *bvh)
{
if (bvh == NULL) {
fprintf(stderr, "Error: BVH DUMP structure is NULL\n");
return;
}
char file_name[256];
const char *dump_directory = "bvh_dump";
const char *dump_sub_directory = NULL;
/* Map the dump flavor onto its sub directory name. */
switch (bvh->dump_type) {
case BVH_ANV:
dump_sub_directory = "BVH_ANV";
break;
case BVH_IR_HDR:
dump_sub_directory = "BVH_IR_HDR";
break;
case BVH_IR_AS:
dump_sub_directory = "BVH_IR_AS";
break;
default:
unreachable("invalid dump type");
}
create_directory(dump_directory, dump_sub_directory);
snprintf(file_name, sizeof(file_name),
bvh->geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR
? "%s/%s/tlas_%d.txt"
: "%s/%s/blas_%d.txt",
dump_directory, dump_sub_directory, bvh->bvh_id);
FILE *file = fopen(file_name, "w");
if (file == NULL) {
perror("Error creating file");
return;
}
fprintf(stderr, "Dump File created: %s\n", file_name);
uint8_t *addr = (uint8_t *)(bvh->bo->map);
/* Dump every byte like this: B0 B1 B2 B3 ... B15 */
for (uint64_t i = 0; i < bvh->dump_size; i++) {
/* volatile read: the BO was written by the GPU, not this thread. */
uint8_t result = *(volatile uint8_t *)((uint8_t *)addr + i);
fprintf(file, "%02" PRIx8 " ", result);
if ((i + 1) % 16 == 0) {
fprintf(file, "\n");
}
}
fclose(file);
}
/* Destroy an acceleration structure. When any bvh-dump debug flag is set,
 * first write every dump accumulated in the global bvhDumpArray to files,
 * release their BOs, and reset the array.
 * NOTE(review): bvhDumpArray is file-scope global state accessed without a
 * lock here — confirm destruction is externally synchronized.
 */
void
genX(DestroyAccelerationStructureKHR)(
VkDevice _device,
VkAccelerationStructureKHR accelerationStructure,
const VkAllocationCallbacks* pAllocator)
{
if (INTEL_DEBUG(DEBUG_BVH_ANY)) {
/* create bvh dump file */
ANV_FROM_HANDLE(anv_device, device, _device);
for (uint32_t i = 0; i < bvh_dump_array_size; i++) {
struct bvh_dump_struct *bvh = bvhDumpArray + i;
create_dump_file(bvh);
/* bvh itself can't be NULL here (array element); the check also
 * guards against entries whose BO allocation failed.
 */
if (bvh && bvh->bo) {
anv_device_release_bo(device, bvh->bo);
}
}
/* Reset so a later destroy doesn't re-dump or double-free. */
free(bvhDumpArray);
bvhDumpArray = NULL;
bvh_dump_array_size = 0;
}
vk_common_DestroyAccelerationStructureKHR(_device, accelerationStructure,
pAllocator);
}