radv/rra: Dump basic ray history tokens

This only dumps the begin tokens. Tokens are written to a buffer
that starts with a 12-byte header.

We use an intermediate format for the ray history tokens because the RRA
format is very inefficient.

Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25548>
This commit is contained in:
Konstantin Seurer 2023-09-30 22:37:16 +02:00
parent 26939f016d
commit 767f628079
9 changed files with 778 additions and 30 deletions

View file

@ -32,10 +32,43 @@ VKAPI_ATTR VkResult VKAPI_CALL
rra_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
{
RADV_FROM_HANDLE(radv_queue, queue, _queue);
if (queue->device->rra_trace.triggered) {
queue->device->rra_trace.triggered = false;
if (_mesa_hash_table_num_entries(queue->device->rra_trace.accel_structs) == 0) {
fprintf(stderr, "radv: No acceleration structures captured, not saving RRA trace.\n");
} else {
char filename[2048];
time_t t = time(NULL);
struct tm now = *localtime(&t);
snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.rra", util_get_process_name(),
1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
VkResult result = radv_rra_dump_trace(_queue, filename);
if (result == VK_SUCCESS)
fprintf(stderr, "radv: RRA capture saved to '%s'\n", filename);
else
fprintf(stderr, "radv: Failed to save RRA capture!\n");
}
}
VkResult result = queue->device->layer_dispatch.rra.QueuePresentKHR(_queue, pPresentInfo);
if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
return result;
VkDevice _device = radv_device_to_handle(queue->device);
radv_rra_trace_clear_ray_history(_device, &queue->device->rra_trace);
if (queue->device->rra_trace.triggered) {
result = queue->device->layer_dispatch.rra.DeviceWaitIdle(_device);
if (result != VK_SUCCESS)
return result;
struct radv_ray_history_header *header = queue->device->rra_trace.ray_history_data;
header->offset = sizeof(struct radv_ray_history_header);
}
if (!queue->device->rra_trace.copy_after_build)
return VK_SUCCESS;
@ -46,7 +79,7 @@ rra_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
if (!data->is_dead)
continue;
radv_destroy_rra_accel_struct_data(radv_device_to_handle(queue->device), data);
radv_destroy_rra_accel_struct_data(_device, data);
_mesa_hash_table_remove(accel_structs, entry);
}
@ -292,3 +325,45 @@ rra_DestroyAccelerationStructureKHR(VkDevice _device, VkAccelerationStructureKHR
device->layer_dispatch.rra.DestroyAccelerationStructureKHR(_device, _structure, pAllocator);
}
/* Layer entrypoint for vkQueueSubmit2KHR.
 *
 * Forwards the submit to the driver first, then, if an RRA capture has been
 * triggered, moves the per-command-buffer ray history metadata into the
 * device-level list so it can later be matched against the GPU-written
 * ray history tokens.
 */
VKAPI_ATTR VkResult VKAPI_CALL
rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence _fence)
{
RADV_FROM_HANDLE(radv_queue, queue, _queue);
struct radv_device *device = queue->device;
/* Submit unconditionally; tracing must not change submission behavior. */
VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
if (result != VK_SUCCESS || !device->rra_trace.triggered)
return result;
uint32_t total_trace_count = 0;
/* data_mtx guards the device-level ray_history dynarray. */
simple_mtx_lock(&device->rra_trace.data_mtx);
for (uint32_t submit_index = 0; submit_index < submitCount; submit_index++) {
for (uint32_t i = 0; i < pSubmits[submit_index].commandBufferInfoCount; i++) {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pSubmits[submit_index].pCommandBufferInfos[i].commandBuffer);
uint32_t trace_count =
util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *);
if (!trace_count)
continue;
total_trace_count += trace_count;
/* Copy the pointers; ownership of the entries effectively moves to the
 * device list (they are freed by radv_rra_trace_clear_ray_history). */
util_dynarray_append_dynarray(&device->rra_trace.ray_history, &cmd_buffer->ray_history);
}
}
if (!total_trace_count) {
simple_mtx_unlock(&device->rra_trace.data_mtx);
return result;
}
/* Wait for the GPU so the tokens of this submit are fully written before
 * the header is advanced for the next submit. */
result = device->layer_dispatch.rra.DeviceWaitIdle(radv_device_to_handle(device));
struct radv_ray_history_header *header = device->rra_trace.ray_history_data;
/* NOTE(review): the header is advanced even if DeviceWaitIdle failed —
 * confirm this is intended. */
header->submit_base_index += total_trace_count;
simple_mtx_unlock(&device->rra_trace.data_mtx);
return result;
}

View file

@ -570,6 +570,12 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
}
nir_push_else(b, NULL);
{
if (args->vars.iteration_instance_count) {
nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1 << 16);
nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
}
/* instance */
nir_def *instance_node_addr = build_node_to_addr(device, b, global_bvh_node, false);
nir_store_deref(b, args->vars.instance_addr, instance_node_addr, 1);
@ -670,6 +676,12 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
insert_traversal_triangle_case(device, b, args, &ray_flags, result, global_bvh_node);
}
nir_pop_if(b, NULL);
if (args->vars.iteration_instance_count) {
nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1);
nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
}
}
nir_pop_loop(b, NULL);

View file

@ -118,6 +118,9 @@ struct radv_ray_traversal_vars {
/* Information about the current instance used for culling. */
nir_deref_instr *instance_addr;
nir_deref_instr *sbt_offset_and_flags;
/* Statistics. Iteration count in the low 16 bits, candidate instance counts in the high 16 bits. */
nir_deref_instr *iteration_instance_count;
};
struct radv_ray_traversal_args {

View file

@ -855,6 +855,137 @@ radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
}
}
/* Emits NIR that begins writing one packed ray history token.
 *
 * The generated code decides whether this invocation is traced (only
 * invocations whose launch id is a multiple of ray_history_resolution_scale
 * in all three dimensions are), atomically reserves token_size bytes in the
 * ray history buffer, and stores the 4-byte packed token header
 * (launch index in bits 0-28, hit flag in bit 29, token type in bits 30-31,
 * matching struct radv_packed_token_header).
 *
 * Returns the address just past the stored header, where the caller writes
 * the token payload. The caller must close the emitted control flow with
 * radv_build_token_end (two nir_push_if scopes are left open here).
 */
static nir_def *
radv_build_token_begin(nir_builder *b, struct rt_variables *vars, nir_def *hit, enum radv_packed_token_type token_type,
nir_def *token_size, uint32_t max_token_size)
{
struct radv_rra_trace_data *rra_trace = &vars->device->rra_trace;
assert(rra_trace->ray_history_addr);
assert(rra_trace->ray_history_buffer_size >= max_token_size);
nir_def *ray_history_addr = nir_imm_int64(b, rra_trace->ray_history_addr);
nir_def *launch_id = nir_load_ray_launch_id(b);
/* Trace only one invocation per resolution_scale^3 block of the launch grid. */
nir_def *trace = nir_imm_true(b);
for (uint32_t i = 0; i < 3; i++) {
nir_def *remainder = nir_umod_imm(b, nir_channel(b, launch_id, i), rra_trace->ray_history_resolution_scale);
trace = nir_iand(b, trace, nir_ieq_imm(b, remainder, 0));
}
nir_push_if(b, trace);
/* The atomic targets the 'offset' field, which must be the first dword of
 * the header since we add to the buffer base address directly. */
static_assert(offsetof(struct radv_ray_history_header, offset) == 0, "Unexpected offset");
nir_def *base_offset = nir_global_atomic(b, 32, ray_history_addr, token_size, .atomic_op = nir_atomic_op_iadd);
/* Abuse the dword alignment of token_size to add an invalid bit to offset. */
trace = nir_ieq_imm(b, nir_iand_imm(b, base_offset, 1), 0);
/* The reservation must fit entirely below the end of the buffer... */
nir_def *in_bounds = nir_ule_imm(b, base_offset, rra_trace->ray_history_buffer_size - max_token_size);
/* Make sure we don't overwrite the header in case of an overflow. */
in_bounds = nir_iand(b, in_bounds, nir_uge_imm(b, base_offset, sizeof(struct radv_ray_history_header)));
nir_push_if(b, nir_iand(b, trace, in_bounds));
nir_def *dst_addr = nir_iadd(b, ray_history_addr, nir_u2u64(b, base_offset));
nir_def *launch_size = nir_load_ray_launch_size(b);
/* Downscale launch id/size so the linearized index matches the dispatch
 * size recorded in radv_rra_ray_history_counter. */
nir_def *launch_id_comps[3];
nir_def *launch_size_comps[3];
for (uint32_t i = 0; i < 3; i++) {
launch_id_comps[i] = nir_udiv_imm(b, nir_channel(b, launch_id, i), rra_trace->ray_history_resolution_scale);
launch_size_comps[i] = nir_udiv_imm(b, nir_channel(b, launch_size, i), rra_trace->ray_history_resolution_scale);
}
/* global_index = x + y * width + z * width * height (downscaled). */
nir_def *global_index =
nir_iadd(b, launch_id_comps[0],
nir_iadd(b, nir_imul(b, launch_id_comps[1], launch_size_comps[0]),
nir_imul(b, launch_id_comps[2], nir_imul(b, launch_size_comps[0], launch_size_comps[1]))));
/* Pack the hit flag into bit 29 and the token type into bits 30-31. */
nir_def *launch_index_and_hit = nir_bcsel(b, hit, nir_ior_imm(b, global_index, 1u << 29u), global_index);
nir_build_store_global(b, nir_ior_imm(b, launch_index_and_hit, token_type << 30), dst_addr, .align_mul = 4);
return nir_iadd_imm(b, dst_addr, 4);
}
/* Closes the two nir_push_if scopes opened by radv_build_token_begin
 * (the trace-enabled check and the in-bounds check). Must be called exactly
 * once after the token payload stores. */
static void
radv_build_token_end(nir_builder *b)
{
nir_pop_if(b, NULL);
nir_pop_if(b, NULL);
}
/* Emits NIR that writes one struct radv_packed_end_trace_token to the ray
 * history buffer. On a miss, only the fields up to (but excluding)
 * primitive_id are written, so the reserved token_size is smaller; the
 * store offsets below must stay in sync with the struct layout. */
static void
radv_build_end_trace_token(nir_builder *b, struct rt_variables *vars, nir_def *tmax, nir_def *hit,
nir_def *iteration_instance_count)
{
nir_def *token_size = nir_bcsel(b, hit, nir_imm_int(b, sizeof(struct radv_packed_end_trace_token)),
nir_imm_int(b, offsetof(struct radv_packed_end_trace_token, primitive_id)));
nir_def *dst_addr = radv_build_token_begin(b, vars, hit, radv_packed_token_end_trace, token_size,
sizeof(struct radv_packed_end_trace_token));
{
/* accel_struct_lo/hi (8 bytes). */
nir_build_store_global(b, nir_load_var(b, vars->accel_struct), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 8);
/* The effective dispatch index is the sum of the per-dispatch index and
 * the submit base index, both read from the ray history header. */
nir_def *dispatch_indices =
nir_load_smem_amd(b, 2, nir_imm_int64(b, vars->device->rra_trace.ray_history_addr),
nir_imm_int(b, offsetof(struct radv_ray_history_header, dispatch_index)), .align_mul = 4);
nir_def *dispatch_index = nir_iadd(b, nir_channel(b, dispatch_indices, 0), nir_channel(b, dispatch_indices, 1));
/* flags : 16 | dispatch_index : 16. */
nir_def *dispatch_and_flags = nir_iand_imm(b, nir_load_var(b, vars->cull_mask_and_flags), 0xFFFF);
dispatch_and_flags = nir_ior(b, dispatch_and_flags, dispatch_index);
nir_build_store_global(b, dispatch_and_flags, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* sbt_offset : 4 | sbt_stride : 4 | miss_index : 16 | cull_mask : 8. */
nir_def *shifted_cull_mask = nir_iand_imm(b, nir_load_var(b, vars->cull_mask_and_flags), 0xFF000000);
nir_def *packed_args = nir_load_var(b, vars->sbt_offset);
packed_args = nir_ior(b, packed_args, nir_ishl_imm(b, nir_load_var(b, vars->sbt_stride), 4));
packed_args = nir_ior(b, packed_args, nir_ishl_imm(b, nir_load_var(b, vars->miss_index), 8));
packed_args = nir_ior(b, packed_args, shifted_cull_mask);
nir_build_store_global(b, packed_args, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* Ray description: origin, tmin, direction, tmax (tmax is the value at
 * the start of the trace, passed in by the caller). */
nir_build_store_global(b, nir_load_var(b, vars->origin), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 12);
nir_build_store_global(b, nir_load_var(b, vars->tmin), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
nir_build_store_global(b, nir_load_var(b, vars->direction), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 12);
nir_build_store_global(b, tmax, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* iteration_count : 16 | instance_count : 16, already packed by the
 * traversal loop. */
nir_build_store_global(b, iteration_instance_count, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* The remaining fields are only meaningful (and only reserved) on a hit. */
nir_push_if(b, hit);
{
nir_build_store_global(b, nir_load_var(b, vars->primitive_id), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* Strip the flag bits; geometry id occupies the low 28 bits. */
nir_def *geometry_id = nir_iand_imm(b, nir_load_var(b, vars->geometry_id_and_flags), 0xFFFFFFF);
nir_build_store_global(b, geometry_id, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* instance_id : 24 | hit_kind : 8; the instance id is fetched from the
 * BVH instance node of the committed hit. */
nir_def *instance_id_and_hit_kind =
nir_build_load_global(b, 1, 32,
nir_iadd_imm(b, nir_load_var(b, vars->instance_addr),
offsetof(struct radv_bvh_instance_node, instance_id)));
instance_id_and_hit_kind =
nir_ior(b, instance_id_and_hit_kind, nir_ishl_imm(b, nir_load_var(b, vars->hit_kind), 24));
nir_build_store_global(b, instance_id_and_hit_kind, dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
/* t: the committed hit distance (vars->tmax after traversal). */
nir_build_store_global(b, nir_load_var(b, vars->tmax), dst_addr, .align_mul = 4);
dst_addr = nir_iadd_imm(b, dst_addr, 4);
}
nir_pop_if(b, NULL);
}
radv_build_token_end(b);
}
static nir_function_impl *
lower_any_hit_for_intersection(nir_shader *any_hit)
{
@ -1432,6 +1563,14 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
.sbt_offset_and_flags = nir_build_deref_var(b, trav_vars.sbt_offset_and_flags),
};
nir_variable *iteration_instance_count = NULL;
if (vars->device->rra_trace.ray_history_addr) {
iteration_instance_count =
nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "iteration_instance_count");
nir_store_var(b, iteration_instance_count, nir_imm_int(b, 0), 0x1);
trav_vars_args.iteration_instance_count = nir_build_deref_var(b, iteration_instance_count);
}
struct traversal_data data = {
.device = device,
.vars = vars,
@ -1464,8 +1603,14 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
.data = &data,
};
nir_def *original_tmax = nir_load_var(b, vars->tmax);
radv_build_ray_traversal(device, b, &args);
if (vars->device->rra_trace.ray_history_addr)
radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit),
nir_load_var(b, iteration_instance_count));
nir_metadata_preserve(nir_shader_get_entrypoint(b->shader), nir_metadata_none);
radv_nir_lower_hit_attrib_derefs(b->shader);

View file

@ -326,6 +326,8 @@ radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
util_dynarray_fini(&cmd_buffer->ray_history);
list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
@ -404,6 +406,8 @@ radv_create_cmd_buffer(struct vk_command_pool *pool, struct vk_command_buffer **
for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
util_dynarray_init(&cmd_buffer->ray_history, NULL);
}
*cmd_buffer_out = &cmd_buffer->vk;
@ -438,6 +442,8 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
free(up);
}
util_dynarray_clear(&cmd_buffer->ray_history);
cmd_buffer->push_constant_stages = 0;
cmd_buffer->scratch_size_per_wave_needed = 0;
cmd_buffer->scratch_waves_wanted = 0;
@ -10334,6 +10340,71 @@ radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_
radv_compute_dispatch(cmd_buffer, &info);
}
/* Records RRA ray history metadata for one vkCmdTraceRays dispatch and
 * points the ray history header at the new dispatch index.
 *
 * Only direct dispatches are supported: indirect dispatches (indirect_va)
 * and missing commands are skipped because the dispatch size is not known
 * on the CPU. On malloc failure the dispatch is silently not traced. */
static void
radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
uint64_t indirect_va)
{
if (!cmd || indirect_va)
return;
struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
if (!data)
return;
/* Record the downscaled dispatch size so it matches the launch indices
 * written by radv_build_token_begin. */
uint32_t width = DIV_ROUND_UP(cmd->width, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
uint32_t height = DIV_ROUND_UP(cmd->height, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
uint32_t depth = DIV_ROUND_UP(cmd->depth, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
/* NOTE(review): these divisions assume non-zero hit/miss SBT strides —
 * confirm callers guarantee that. */
struct radv_rra_ray_history_counter counter = {
.dispatch_size = {width, height, depth},
.hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
.miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
.shader_count = cmd_buffer->state.rt_pipeline->stage_count,
.pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
.mode = 1,
.stride = sizeof(uint32_t),
.data_size = 0,
.ray_id_begin = 0,
.ray_id_end = 0xFFFFFFFF,
.pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
};
struct radv_rra_ray_history_dispatch_size dispatch_size = {
.size = {width, height, depth},
};
struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
data->metadata = (struct radv_rra_ray_history_metadata){
.counter_info.type = RADV_RRA_COUNTER_INFO,
.counter_info.size = sizeof(struct radv_rra_ray_history_counter),
.counter = counter,
.dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
.dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
.dispatch_size = dispatch_size,
.traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
.traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
.traversal_flags = traversal_flags,
};
/* The dispatch index lives in the upper 16 bits of the header field, matching
 * the flags:16/dispatch_index:16 packing of radv_packed_end_trace_token. */
uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
<< 16;
/* Entry is freed later by radv_rra_trace_clear_ray_history. */
util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
/* Make prior shader accesses to the history buffer visible before the CP
 * rewrites the header's dispatch_index, and order subsequent reads after it. */
cmd_buffer->state.flush_bits |=
RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL);
radv_update_buffer_cp(
cmd_buffer,
cmd_buffer->device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
&dispatch_index, sizeof(dispatch_index));
}
enum radv_rt_mode {
radv_rt_mode_direct,
radv_rt_mode_indirect,
@ -10366,6 +10437,9 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2K
if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT)
return;
if (unlikely(cmd_buffer->device->rra_trace.ray_history_buffer))
radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
uint32_t base_reg = rt_prolog->info.user_data_0;

View file

@ -630,28 +630,8 @@ capture_trace(VkQueue _queue)
VkResult result = VK_SUCCESS;
char filename[2048];
struct tm now;
time_t t;
t = time(NULL);
now = *localtime(&t);
if (queue->device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA) {
if (_mesa_hash_table_num_entries(queue->device->rra_trace.accel_structs) == 0) {
fprintf(stderr, "radv: No acceleration structures captured, not saving RRA trace.\n");
} else {
snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.rra", util_get_process_name(),
1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
result = radv_rra_dump_trace(_queue, filename);
if (result == VK_SUCCESS)
fprintf(stderr, "radv: RRA capture saved to '%s'\n", filename);
else
fprintf(stderr, "radv: Failed to save RRA capture!\n");
}
}
if (queue->device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
queue->device->rra_trace.triggered = true;
if (queue->device->vk.memory_trace_data.is_enabled) {
simple_mtx_lock(&queue->device->vk.memory_trace_data.token_mtx);
@ -1064,7 +1044,9 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
}
if ((device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(physical_device, false)) {
radv_rra_trace_init(device);
result = radv_rra_trace_init(device);
if (result != VK_SUCCESS)
goto fail;
}
if (device->vk.enabled_features.rayTracingPipelineShaderGroupHandleCaptureReplay) {
@ -1087,6 +1069,8 @@ fail:
radv_sqtt_finish(device);
radv_rra_trace_finish(radv_device_to_handle(device), &device->rra_trace);
radv_spm_finish(device);
radv_trap_handler_finish(device);

View file

@ -803,12 +803,13 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
goto fail;
bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags);
bool emit_ray_history = !!device->rra_trace.ray_history_buffer;
radv_hash_rt_shaders(device, pipeline->sha1, stages, pCreateInfo, pipeline->groups);
pipeline->base.base.pipeline_hash = *(uint64_t *)pipeline->sha1;
bool cache_hit = false;
if (!keep_executable_info)
if (!keep_executable_info && !emit_ray_history)
cache_hit = radv_ray_tracing_pipeline_cache_search(device, cache, pipeline, pCreateInfo);
if (!cache_hit) {
@ -828,7 +829,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
radv_rmv_log_rt_pipeline_create(device, pipeline);
if (!cache_hit)
if (!cache_hit && !emit_ray_history)
radv_ray_tracing_pipeline_cache_insert(device, cache, pipeline, pCreateInfo->stageCount, pipeline->sha1);
/* write shader VAs into group handles */

View file

@ -847,13 +847,131 @@ struct radv_rra_accel_struct_data {
void radv_destroy_rra_accel_struct_data(VkDevice device, struct radv_rra_accel_struct_data *data);
/* 12-byte header at the start of the ray history buffer. */
struct radv_ray_history_header {
uint32_t offset;            /* Write cursor (bytes); atomically bumped by shaders.
                             * Bit 0 doubles as an "invalid" flag (see
                             * radv_build_token_begin). */
uint32_t dispatch_index;    /* Current dispatch index, packed into the high
                             * 16 bits; updated by the CP per trace-rays call. */
uint32_t submit_base_index; /* Running total of traced dispatches across
                             * previous submits. */
};
/* Token types stored in the 2 type bits of radv_packed_token_header. */
enum radv_packed_token_type {
radv_packed_token_end_trace,
};
/* First dword of every packed token, written by radv_build_token_begin. */
struct radv_packed_token_header {
uint32_t launch_index : 29; /* Linearized (downscaled) launch index. */
uint32_t hit : 1;           /* Whether the trace committed a hit. */
uint32_t token_type : 2;    /* enum radv_packed_token_type. */
};
/* Intermediate (non-RRA) end-of-trace token. On a miss, only the fields up
 * to primitive_id are written; the offsets here must match the stores in
 * radv_build_end_trace_token. */
struct radv_packed_end_trace_token {
struct radv_packed_token_header header;
uint32_t accel_struct_lo;
uint32_t accel_struct_hi;
uint32_t flags : 16;          /* Low 16 bits of cull_mask_and_flags. */
uint32_t dispatch_index : 16;
uint32_t sbt_offset : 4;
uint32_t sbt_stride : 4;
uint32_t miss_index : 16;
uint32_t cull_mask : 8;
float origin[3];
float tmin;
float direction[3];
float tmax;                   /* tmax at the start of the trace. */
uint32_t iteration_count : 16;
uint32_t instance_count : 16;
/* Fields below are only present when header.hit is set. */
uint32_t primitive_id;
uint32_t geometry_id;
uint32_t instance_id : 24;
uint32_t hit_kind : 8;
float t;                      /* Committed hit distance. */
};
static_assert(sizeof(struct radv_packed_end_trace_token) == 72, "Unexpected radv_packed_end_trace_token size");
/* Metadata chunk tags written ahead of each dispatch's tokens in the RRA
 * ray history chunk. */
enum radv_rra_ray_history_metadata_type {
RADV_RRA_COUNTER_INFO = 1,
RADV_RRA_DISPATCH_SIZE = 2,
RADV_RRA_TRAVERSAL_FLAGS = 3,
};
/* Type/size prefix for each metadata section below. */
struct radv_rra_ray_history_metadata_info {
enum radv_rra_ray_history_metadata_type type : 32;
uint32_t padding;
uint64_t size; /* Size in bytes of the section that follows. */
};
enum radv_rra_pipeline_type {
RADV_RRA_PIPELINE_RAY_TRACING,
};
/* Per-dispatch counter information; filled in radv_trace_trace_rays. */
struct radv_rra_ray_history_counter {
uint32_t dispatch_size[3]; /* Downscaled by ray_history_resolution_scale. */
uint32_t hit_shader_count;
uint32_t miss_shader_count;
uint32_t shader_count;
uint64_t pipeline_api_hash;
uint32_t mode;
uint32_t mask;
uint32_t stride;
uint32_t data_size;
uint32_t lost_token_size;
uint32_t ray_id_begin;
uint32_t ray_id_end;
enum radv_rra_pipeline_type pipeline_type : 32;
};
struct radv_rra_ray_history_dispatch_size {
uint32_t size[3];
uint32_t padding;
};
struct radv_rra_ray_history_traversal_flags {
uint32_t box_sort_mode : 1;
uint32_t node_ptr_flags : 1;
uint32_t reserved : 30;
uint32_t padding;
};
/* Full per-dispatch metadata blob as laid out in the RRA file; the layout is
 * pinned by the static_assert below. */
struct radv_rra_ray_history_metadata {
struct radv_rra_ray_history_metadata_info counter_info;
struct radv_rra_ray_history_counter counter;
struct radv_rra_ray_history_metadata_info dispatch_size_info;
struct radv_rra_ray_history_dispatch_size dispatch_size;
struct radv_rra_ray_history_metadata_info traversal_flags_info;
struct radv_rra_ray_history_traversal_flags traversal_flags;
};
static_assert(sizeof(struct radv_rra_ray_history_metadata) == 136,
"radv_rra_ray_history_metadata does not match RRA expectations");
/* One entry per traced trace-rays dispatch, allocated in
 * radv_trace_trace_rays and freed in radv_rra_trace_clear_ray_history. */
struct radv_rra_ray_history_data {
struct radv_rra_ray_history_metadata metadata;
};
/* Per-device state for RRA (Radeon Raytracing Analyzer) capture. */
struct radv_rra_trace_data {
struct hash_table *accel_structs;
struct hash_table_u64 *accel_struct_vas;
simple_mtx_t data_mtx; /* Guards the device-level ray_history list. */
bool validate_as;
bool copy_after_build;
bool triggered; /* Set when a capture has been requested for this frame. */
uint32_t copy_memory_index;
/* List of radv_rra_ray_history_data*, one per traced dispatch. */
struct util_dynarray ray_history;
/* Host-visible buffer that shaders append packed ray history tokens to;
 * starts with a struct radv_ray_history_header. */
VkBuffer ray_history_buffer;
VkDeviceMemory ray_history_memory;
void *ray_history_data;     /* Persistent CPU mapping of the buffer. */
uint64_t ray_history_addr;  /* GPU VA of the buffer. */
uint32_t ray_history_buffer_size;
uint32_t ray_history_resolution_scale; /* Only every Nth invocation per axis is traced. */
};
enum radv_dispatch_table {
@ -1776,6 +1894,8 @@ struct radv_cmd_buffer {
uint64_t shader_upload_seq;
uint32_t sqtt_cb_id;
struct util_dynarray ray_history;
};
static inline bool
@ -3002,9 +3122,10 @@ VkResult radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_wins
VkResult radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr);
void radv_rra_trace_init(struct radv_device *device);
VkResult radv_rra_trace_init(struct radv_device *device);
VkResult radv_rra_dump_trace(VkQueue vk_queue, char *filename);
void radv_rra_trace_clear_ray_history(VkDevice _device, struct radv_rra_trace_data *data);
void radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data);
void radv_memory_trace_init(struct radv_device *device);

View file

@ -42,6 +42,7 @@ static_assert(sizeof(struct rra_file_header) == 32, "rra_file_header does not ma
enum rra_chunk_version {
RADV_RRA_ASIC_API_INFO_CHUNK_VERSION = 0x1,
RADV_RRA_RAY_HISTORY_CHUNK_VERSION = 0x2,
RADV_RRA_ACCEL_STRUCT_CHUNK_VERSION = 0xF0005,
};
@ -894,7 +895,7 @@ exit:
return result;
}
void
VkResult
radv_rra_trace_init(struct radv_device *device)
{
device->rra_trace.validate_as = debug_get_bool_option("RADV_RRA_TRACE_VALIDATE", false);
@ -906,11 +907,91 @@ radv_rra_trace_init(struct radv_device *device)
device->rra_trace.copy_memory_index = radv_find_memory_index(
device->physical_device,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
util_dynarray_init(&device->rra_trace.ray_history, NULL);
device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
if (device->rra_trace.ray_history_buffer_size <
sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
return VK_SUCCESS;
device->rra_trace.ray_history_resolution_scale = debug_get_num_option("RADV_RRA_TRACE_RESOLUTION_SCALE", 1);
device->rra_trace.ray_history_resolution_scale = MAX2(device->rra_trace.ray_history_resolution_scale, 1);
VkBufferCreateInfo buffer_create_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext =
&(VkBufferUsageFlags2CreateInfoKHR){
.sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
.usage = VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR | VK_BUFFER_USAGE_2_SHADER_DEVICE_ADDRESS_BIT_KHR,
},
.size = device->rra_trace.ray_history_buffer_size,
};
VkDevice _device = radv_device_to_handle(device);
VkResult result = radv_CreateBuffer(_device, &buffer_create_info, NULL, &device->rra_trace.ray_history_buffer);
if (result != VK_SUCCESS)
return result;
VkMemoryRequirements requirements;
vk_common_GetBufferMemoryRequirements(_device, device->rra_trace.ray_history_buffer, &requirements);
VkMemoryAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.allocationSize = requirements.size,
.memoryTypeIndex = radv_find_memory_index(device->physical_device, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT),
};
result = radv_AllocateMemory(_device, &alloc_info, NULL, &device->rra_trace.ray_history_memory);
if (result != VK_SUCCESS)
return result;
result = vk_common_MapMemory(_device, device->rra_trace.ray_history_memory, 0, VK_WHOLE_SIZE, 0,
(void **)&device->rra_trace.ray_history_data);
if (result != VK_SUCCESS)
return result;
result = vk_common_BindBufferMemory(_device, device->rra_trace.ray_history_buffer,
device->rra_trace.ray_history_memory, 0);
VkBufferDeviceAddressInfo addr_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
.buffer = device->rra_trace.ray_history_buffer,
};
device->rra_trace.ray_history_addr = radv_GetBufferDeviceAddress(_device, &addr_info);
struct radv_ray_history_header *ray_history_header = device->rra_trace.ray_history_data;
memset(ray_history_header, 0, sizeof(struct radv_ray_history_header));
ray_history_header->offset = 1;
return result;
}
/* Frees every recorded per-dispatch ray history entry and empties the list.
 * The _device parameter is unused; it is kept for interface symmetry with
 * the other radv_rra_trace_* functions. */
void
radv_rra_trace_clear_ray_history(VkDevice _device, struct radv_rra_trace_data *data)
{
util_dynarray_foreach (&data->ray_history, struct radv_rra_ray_history_data *, history)
free(*history);
util_dynarray_clear(&data->ray_history);
}
void
radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
{
radv_DestroyBuffer(vk_device, data->ray_history_buffer, NULL);
if (data->ray_history_memory)
vk_common_UnmapMemory(vk_device, data->ray_history_memory);
radv_FreeMemory(vk_device, data->ray_history_memory, NULL);
radv_rra_trace_clear_ray_history(vk_device, data);
util_dynarray_fini(&data->ray_history);
if (data->accel_structs)
hash_table_foreach (data->accel_structs, entry)
radv_destroy_rra_accel_struct_data(vk_device, entry->data);
@ -953,6 +1034,8 @@ struct rra_copy_context {
void *mapped_data;
struct hash_entry **entries;
uint32_t min_size;
};
static VkResult
@ -962,7 +1045,7 @@ rra_copy_context_init(struct rra_copy_context *ctx)
if (device->rra_trace.copy_after_build)
return VK_SUCCESS;
uint32_t max_size = 0;
uint32_t max_size = ctx->min_size;
uint32_t accel_struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
for (unsigned i = 0; i < accel_struct_count; i++) {
struct radv_rra_accel_struct_data *data = ctx->entries[i]->data;
@ -1115,6 +1198,119 @@ rra_unmap_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
vk_common_UnmapMemory(ctx->device, data->memory);
}
/* Token type IDs of the RRA file format's ray history stream (stored in the
 * 'type' field of rra_ray_history_control_token). */
enum rra_ray_history_token_type {
rra_ray_history_token_begin,
rra_ray_history_token_tlas,
rra_ray_history_token_blas,
rra_ray_history_token_end,
rra_ray_history_token_call,
rra_ray_history_token_timestamp,
rra_ray_history_token_ahit_status,
rra_ray_history_token_call2,
rra_ray_history_token_isec_status,
rra_ray_history_token_end2,
rra_ray_history_token_begin2,
rra_ray_history_token_normal = 0xFFFF,
};
/* On-disk token layouts of the RRA ray history stream. Each layout is pinned
 * by a static_assert; these are written verbatim when converting the
 * intermediate radv_packed_* tokens to the RRA format. */

/* Identifies which ray a token belongs to; has_control indicates a
 * control token follows. */
struct rra_ray_history_id_token {
uint32_t id : 30;
uint32_t reserved : 1;
uint32_t has_control : 1;
};
static_assert(sizeof(struct rra_ray_history_id_token) == 4, "rra_ray_history_id_token does not match RRA expectations");
/* Describes the type and payload length of the token data that follows. */
struct rra_ray_history_control_token {
uint32_t type : 16;  /* enum rra_ray_history_token_type. */
uint32_t length : 8; /* Payload length, in dwords (presumably — verify against writer). */
uint32_t data : 8;
};
static_assert(sizeof(struct rra_ray_history_control_token) == 4,
"rra_ray_history_control_token does not match RRA expectations");
/* Ray launch parameters at the start of a trace. */
struct rra_ray_history_begin_token {
uint32_t wave_id;
uint32_t launch_ids[3];
uint32_t accel_struct_lo;
uint32_t accel_struct_hi;
uint32_t ray_flags;
uint32_t cull_mask : 8;
uint32_t stb_offset : 4;
uint32_t stb_stride : 4;
uint32_t miss_index : 16;
float origin[3];
float tmin;
float direction[3];
float tmax;
};
static_assert(sizeof(struct rra_ray_history_begin_token) == 64,
"rra_ray_history_begin_token does not match RRA expectations");
struct rra_ray_history_begin2_token {
struct rra_ray_history_begin_token base;
uint32_t call_instruction_id;
uint32_t unique_wave_id;
uint32_t parent_unique_wave_id;
};
static_assert(sizeof(struct rra_ray_history_begin2_token) == 76,
"rra_ray_history_begin2_token does not match RRA expectations");
/* Committed hit information at the end of a trace. */
struct rra_ray_history_end_token {
uint32_t primitive_index;
uint32_t geometry_index;
};
static_assert(sizeof(struct rra_ray_history_end_token) == 8,
"rra_ray_history_end_token does not match RRA expectations");
struct rra_ray_history_end2_token {
struct rra_ray_history_end_token base;
uint32_t instance_index : 24;
uint32_t hit_kind : 8;
uint32_t iteration_count;
uint32_t candidate_instance_count;
float t;
};
static_assert(sizeof(struct rra_ray_history_end2_token) == 24,
"rra_ray_history_end2_token does not match RRA expectations");
struct rra_ray_history_tlas_token {
uint64_t addr;
};
static_assert(sizeof(struct rra_ray_history_tlas_token) == 8,
"rra_ray_history_tlas_token does not match RRA expectations");
struct rra_ray_history_blas_token {
uint64_t addr;
};
static_assert(sizeof(struct rra_ray_history_blas_token) == 8,
"rra_ray_history_blas_token does not match RRA expectations");
struct rra_ray_history_call_token {
uint32_t addr[2];
};
static_assert(sizeof(struct rra_ray_history_call_token) == 8,
"rra_ray_history_call_token does not match RRA expectations");
struct rra_ray_history_call2_token {
struct rra_ray_history_call_token base;
uint32_t sbt_index;
};
static_assert(sizeof(struct rra_ray_history_call2_token) == 12,
"rra_ray_history_call2_token does not match RRA expectations");
struct rra_ray_history_isec_token {
float t;
uint32_t hit_kind;
};
static_assert(sizeof(struct rra_ray_history_isec_token) == 8,
"rra_ray_history_isec_token does not match RRA expectations");
struct rra_ray_history_timestamp_token {
uint64_t gpu_timestamp;
};
static_assert(sizeof(struct rra_ray_history_timestamp_token) == 8,
"rra_ray_history_timestamp_token does not match RRA expectations");
/* Serialize the captured RRA trace (acceleration structures + ray history)
 * for this queue's device into the file at 'filename'.
 *
 * NOTE(review): this region is a diff rendering, not the full source — the
 * '@ -NNNN,...' lines below are hunk markers standing in for elided code, so
 * several parts of the function body are not visible here.
 */
VkResult
radv_rra_dump_trace(VkQueue vk_queue, char *filename)
{
@ -1127,13 +1323,22 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
      return result;
   /* Per-object file offsets, recorded while writing payloads and replayed
    * later when emitting the chunk descriptions. All cleanup funnels through
    * the shared 'cleanup:' label at the end.
    */
   uint64_t *accel_struct_offsets = NULL;
   uint64_t *ray_history_offsets = NULL;
   struct hash_entry **hash_entries = NULL;
   FILE *file = NULL;
   uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
   accel_struct_offsets = calloc(struct_count, sizeof(uint64_t));
   if (!accel_struct_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   /* One offset slot per traced ray dispatch; calloc so a zero entry means
    * "no tokens written yet for this dispatch" (tested below).
    */
   uint32_t dispatch_count =
      util_dynarray_num_elements(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *);
   ray_history_offsets = calloc(dispatch_count, sizeof(uint64_t));
   if (!ray_history_offsets) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }
   hash_entries = malloc(sizeof(*hash_entries) * struct_count);
   if (!hash_entries) {
@ -1175,6 +1380,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
      .queue = vk_queue,
      .entries = hash_entries,
      .family_index = queue->vk.queue_family_index,
      .min_size = device->rra_trace.ray_history_buffer_size,
   };
   result = rra_copy_context_init(&copy_ctx);
@ -1197,6 +1403,118 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
      written_accel_struct_count++;
   }
   /* Everything from here to chunk_info_offset is the ray-history section. */
   uint64_t ray_history_offset = (uint64_t)ftell(file);
   /* Sentinel: forces the metadata header of the first dispatch we encounter
    * to be written (any real dispatch_index differs from 0xFFFFFFFF).
    */
   uint32_t ray_history_index = 0xFFFFFFFF;
   struct radv_rra_ray_history_data *ray_history = NULL;
   uint8_t *history = device->rra_trace.ray_history_data;
   /* The capture buffer starts with a radv_ray_history_header; header->offset
    * is the end of the token stream the GPU wrote.
    */
   struct radv_ray_history_header *history_header = (void *)history;
   uint32_t history_buffer_size_mb = device->rra_trace.ray_history_buffer_size / 1024 / 1024;
   uint32_t history_size_mb = history_header->offset / 1024 / 1024;
   if (history_header->offset > device->rra_trace.ray_history_buffer_size) {
      /* NOTE(review): typo in the message — "is to small" should read
       * "is too small". (Runtime string, so left untouched in this
       * documentation-only pass.)
       */
      fprintf(stderr, "radv: rra: The ray history buffer size (%u MB) is to small. %u MB is required.\n",
              history_buffer_size_mb, history_size_mb);
   } else {
      fprintf(stderr, "radv: rra: Ray history buffer size = %u MB, ray history size = %u MB.\n", history_buffer_size_mb,
              history_size_mb);
   }
   /* Walk the packed token stream. Tokens are variable-size: a full
    * radv_packed_end_trace_token when the ray hit, otherwise truncated just
    * before the hit payload (primitive_id onward).
    */
   uint32_t token_size;
   for (uint32_t offset = sizeof(struct radv_ray_history_header); offset < history_header->offset;
        offset += token_size) {
      struct radv_packed_end_trace_token *src = (void *)(history + offset);
      token_size = src->header.hit ? sizeof(struct radv_packed_end_trace_token)
                                   : offsetof(struct radv_packed_end_trace_token, primitive_id);
      /* Tokens are grouped by dispatch: when the dispatch index changes,
       * start that dispatch's section with its metadata record and remember
       * where it begins. The assert relies on each dispatch's tokens being
       * contiguous in the stream — each offset slot is written once.
       */
      if (src->dispatch_index != ray_history_index) {
         ray_history_index = src->dispatch_index;
         assert(ray_history_index < dispatch_count);
         ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *,
                                              ray_history_index);
         assert(!ray_history_offsets[ray_history_index]);
         ray_history_offsets[ray_history_index] = (uint64_t)ftell(file);
         fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
      }
      /* Recover the 3D launch id from the flat launch index using the
       * dispatch dimensions recorded in the metadata.
       */
      uint32_t *dispatch_size = ray_history->metadata.dispatch_size.size;
      uint32_t x = src->header.launch_index % dispatch_size[0];
      uint32_t y = (src->header.launch_index / dispatch_size[0]) % dispatch_size[1];
      uint32_t z = src->header.launch_index / (dispatch_size[0] * dispatch_size[1]);
      /* Emit the RRA "begin" token: id token, control token (type + payload
       * length in dwords), then the begin payload itself.
       */
      struct rra_ray_history_id_token begin_id = {
         .id = src->header.launch_index,
         .has_control = true,
      };
      struct rra_ray_history_control_token begin_control = {
         .type = rra_ray_history_token_begin,
         .length = sizeof(struct rra_ray_history_begin_token) / 4,
      };
      struct rra_ray_history_begin_token begin = {
         /* Wave id derived from launch index assuming 32 lanes per wave —
          * TODO confirm this matches what RRA expects on wave64 hardware.
          */
         .wave_id = src->header.launch_index / 32,
         .launch_ids = {x, y, z},
         .accel_struct_lo = src->accel_struct_lo,
         /* Only the low 25 bits of the high half are stored. */
         .accel_struct_hi = src->accel_struct_hi & 0x1FFFFFF,
         .ray_flags = src->flags,
         .cull_mask = src->cull_mask,
         .stb_offset = src->sbt_offset,
         .stb_stride = src->sbt_stride,
         .miss_index = src->miss_index,
         .origin[0] = src->origin[0],
         .origin[1] = src->origin[1],
         .origin[2] = src->origin[2],
         .tmin = src->tmin,
         .direction[0] = src->direction[0],
         .direction[1] = src->direction[1],
         .direction[2] = src->direction[2],
         .tmax = src->tmax,
      };
      fwrite(&begin_id, sizeof(begin_id), 1, file);
      fwrite(&begin_control, sizeof(begin_control), 1, file);
      fwrite(&begin, sizeof(begin), 1, file);
      /* Emit the matching "end2" token. Primitive/geometry indices default to
       * 0xFFFFFFFF (miss) and are overwritten with real hit data below.
       */
      struct rra_ray_history_id_token end_id = {
         .id = src->header.launch_index,
         .has_control = true,
      };
      struct rra_ray_history_control_token end_control = {
         .type = rra_ray_history_token_end2,
         .length = sizeof(struct rra_ray_history_end2_token) / 4,
      };
      struct rra_ray_history_end2_token end = {
         .base.primitive_index = 0xFFFFFFFF,
         .base.geometry_index = 0xFFFFFFFF,
         .iteration_count = src->iteration_count,
         .candidate_instance_count = src->instance_count,
      };
      if (src->header.hit) {
         end.base.primitive_index = src->primitive_id;
         end.base.geometry_index = src->geometry_id;
         end.instance_index = src->instance_id;
         end.hit_kind = src->hit_kind;
         end.t = src->t;
      }
      fwrite(&end_id, sizeof(end_id), 1, file);
      fwrite(&end_control, sizeof(end_control), 1, file);
      fwrite(&end, sizeof(end), 1, file);
   }
   /* Dispatches that produced no tokens still need a metadata record so that
    * every dispatch has a valid offset for the chunk descriptions below.
    */
   for (uint32_t i = 0; i < dispatch_count; i++) {
      if (ray_history_offsets[i])
         continue;
      ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *, i);
      ray_history_offsets[i] = (uint64_t)ftell(file);
      fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
   }
   /* Presumably resets the capture buffer's write cursor for the next frame —
    * TODO confirm why 1 rather than sizeof(struct radv_ray_history_header)
    * as used elsewhere (see header->offset init in the present hook).
    */
   history_header->offset = 1;
   rra_copy_context_finish(&copy_ctx);
   uint64_t chunk_info_offset = (uint64_t)ftell(file);
@ -1204,10 +1522,24 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
   rra_dump_chunk_description(asic_info_offset, 0, sizeof(struct rra_asic_info), "AsicInfo",
                              RADV_RRA_ASIC_API_INFO_CHUNK_VERSION, file);
   /* Two chunk descriptions per dispatch: the fixed-size metadata record and
    * the raw token bytes that follow it. The token size of the last dispatch
    * runs to chunk_info_offset; every other dispatch runs to its successor.
    */
   for (uint32_t i = 0; i < dispatch_count; i++) {
      uint64_t tokens_size;
      if (i == dispatch_count - 1)
         tokens_size = (uint64_t)(chunk_info_offset - ray_history_offsets[i]);
      else
         tokens_size = (uint64_t)(ray_history_offsets[i + 1] - ray_history_offsets[i]);
      tokens_size -= sizeof(struct radv_rra_ray_history_metadata);
      rra_dump_chunk_description(ray_history_offsets[i], 0, sizeof(struct radv_rra_ray_history_metadata),
                                 "HistoryMetadata", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
      rra_dump_chunk_description(ray_history_offsets[i] + sizeof(struct radv_rra_ray_history_metadata), 0, tokens_size,
                                 "HistoryTokensRaw", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
   }
   for (uint32_t i = 0; i < written_accel_struct_count; ++i) {
      uint64_t accel_struct_size;
      if (i == written_accel_struct_count - 1)
         /* NOTE(review): diff residue — the next two lines are the removed
          * (chunk_info_offset) and added (ray_history_offset) versions of the
          * same assignment; only the ray_history_offset line belongs in the
          * final source. As written this does not compile.
          */
         accel_struct_size = (uint64_t)(chunk_info_offset - accel_struct_offsets[i]);
         accel_struct_size = (uint64_t)(ray_history_offset - accel_struct_offsets[i]);
      else
         accel_struct_size = (uint64_t)(accel_struct_offsets[i + 1] - accel_struct_offsets[i]);
@ -1227,6 +1559,7 @@ cleanup:
   fclose(file);
   free(hash_entries);
   free(ray_history_offsets);
   free(accel_struct_offsets);
   return result;
}