diff --git a/src/amd/vulkan/layers/radv_rra_layer.c b/src/amd/vulkan/layers/radv_rra_layer.c
index 7a1c42083f2..a5578b9f69d 100644
--- a/src/amd/vulkan/layers/radv_rra_layer.c
+++ b/src/amd/vulkan/layers/radv_rra_layer.c
@@ -32,10 +32,43 @@ VKAPI_ATTR VkResult VKAPI_CALL
 rra_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
 {
    RADV_FROM_HANDLE(radv_queue, queue, _queue);
+
+   if (queue->device->rra_trace.triggered) {
+      queue->device->rra_trace.triggered = false;
+
+      if (_mesa_hash_table_num_entries(queue->device->rra_trace.accel_structs) == 0) {
+         fprintf(stderr, "radv: No acceleration structures captured, not saving RRA trace.\n");
+      } else {
+         char filename[2048];
+         time_t t = time(NULL);
+         struct tm now = *localtime(&t);
+         snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.rra", util_get_process_name(),
+                  1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
+
+         VkResult result = radv_rra_dump_trace(_queue, filename);
+         if (result == VK_SUCCESS)
+            fprintf(stderr, "radv: RRA capture saved to '%s'\n", filename);
+         else
+            fprintf(stderr, "radv: Failed to save RRA capture!\n");
+      }
+   }
+
    VkResult result = queue->device->layer_dispatch.rra.QueuePresentKHR(_queue, pPresentInfo);
    if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
       return result;
 
+   VkDevice _device = radv_device_to_handle(queue->device);
+   radv_rra_trace_clear_ray_history(_device, &queue->device->rra_trace);
+
+   if (queue->device->rra_trace.triggered) {
+      result = queue->device->layer_dispatch.rra.DeviceWaitIdle(_device);
+      if (result != VK_SUCCESS)
+         return result;
+
+      struct radv_ray_history_header *header = queue->device->rra_trace.ray_history_data;
+      header->offset = sizeof(struct radv_ray_history_header);
+   }
+
    if (!queue->device->rra_trace.copy_after_build)
       return VK_SUCCESS;
 
@@ -46,7 +79,7 @@ rra_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
       if (!data->is_dead)
         continue;
 
-      radv_destroy_rra_accel_struct_data(radv_device_to_handle(queue->device), data);
+      radv_destroy_rra_accel_struct_data(_device, data);
       _mesa_hash_table_remove(accel_structs, entry);
    }
 
@@ -292,3 +325,45 @@ rra_DestroyAccelerationStructureKHR(VkDevice _device, VkAccelerationStructureKHR
 
    device->layer_dispatch.rra.DestroyAccelerationStructureKHR(_device, _structure, pAllocator);
 }
+
+VKAPI_ATTR VkResult VKAPI_CALL
+rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence _fence)
+{
+   RADV_FROM_HANDLE(radv_queue, queue, _queue);
+   struct radv_device *device = queue->device;
+
+   VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
+   if (result != VK_SUCCESS || !device->rra_trace.triggered)
+      return result;
+
+   uint32_t total_trace_count = 0;
+
+   simple_mtx_lock(&device->rra_trace.data_mtx);
+
+   for (uint32_t submit_index = 0; submit_index < submitCount; submit_index++) {
+      for (uint32_t i = 0; i < pSubmits[submit_index].commandBufferInfoCount; i++) {
+         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pSubmits[submit_index].pCommandBufferInfos[i].commandBuffer);
+         uint32_t trace_count =
+            util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *);
+         if (!trace_count)
+            continue;
+
+         total_trace_count += trace_count;
+         util_dynarray_append_dynarray(&device->rra_trace.ray_history, &cmd_buffer->ray_history);
+      }
+   }
+
+   if (!total_trace_count) {
+      simple_mtx_unlock(&device->rra_trace.data_mtx);
+      return result;
+   }
+
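+   /* Idle the GPU so in-flight shaders are done with the ray history buffer
+    * before submit_base_index is bumped below. */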
+   result = device->layer_dispatch.rra.DeviceWaitIdle(radv_device_to_handle(device));
+
+   struct radv_ray_history_header *header = device->rra_trace.ray_history_data;
+   header->submit_base_index += total_trace_count;
+
+   simple_mtx_unlock(&device->rra_trace.data_mtx);
+
+   return result;
+}
diff --git a/src/amd/vulkan/nir/radv_nir_rt_common.c b/src/amd/vulkan/nir/radv_nir_rt_common.c
index f0fccf049f3..e1916111bb4 100644
--- a/src/amd/vulkan/nir/radv_nir_rt_common.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_common.c
@@ -570,6 +570,12 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
          }
          nir_push_else(b, NULL);
          {
+            if (args->vars.iteration_instance_count) {
+               nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
+               iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1 << 16);
+               nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
+            }
+
             /* instance */
             nir_def *instance_node_addr = build_node_to_addr(device, b, global_bvh_node, false);
             nir_store_deref(b, args->vars.instance_addr, instance_node_addr, 1);
@@ -670,6 +676,12 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
             insert_traversal_triangle_case(device, b, args, &ray_flags, result, global_bvh_node);
          }
          nir_pop_if(b, NULL);
+
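+         /* One traversal iteration completed; iterations are counted in the
+          * low 16 bits of the statistics counter, candidate instances above. */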
+         if (args->vars.iteration_instance_count) {
+            nir_def *iteration_instance_count = nir_load_deref(b, args->vars.iteration_instance_count);
+            iteration_instance_count = nir_iadd_imm(b, iteration_instance_count, 1);
+            nir_store_deref(b, args->vars.iteration_instance_count, iteration_instance_count, 0x1);
+         }
       }
       nir_pop_loop(b, NULL);
 
diff --git a/src/amd/vulkan/nir/radv_nir_rt_common.h b/src/amd/vulkan/nir/radv_nir_rt_common.h
index 5ccf38ea1f0..c55e5d82473 100644
--- a/src/amd/vulkan/nir/radv_nir_rt_common.h
+++ b/src/amd/vulkan/nir/radv_nir_rt_common.h
@@ -118,6 +118,9 @@ struct radv_ray_traversal_vars {
    /* Information about the current instance used for culling. */
    nir_deref_instr *instance_addr;
    nir_deref_instr *sbt_offset_and_flags;
+
+   /* Statistics. Iteration count in the low 16 bits, candidate instance count in the high 16 bits. */
+   nir_deref_instr *iteration_instance_count;
 };
 
 struct radv_ray_traversal_args {
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
index 54223d28717..50cb850484f 100644
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
@@ -855,6 +855,137 @@ radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
    }
 }
 
+static nir_def *
+radv_build_token_begin(nir_builder *b, struct rt_variables *vars, nir_def *hit, enum radv_packed_token_type token_type,
+                       nir_def *token_size, uint32_t max_token_size)
+{
+   struct radv_rra_trace_data *rra_trace = &vars->device->rra_trace;
+   assert(rra_trace->ray_history_addr);
+   assert(rra_trace->ray_history_buffer_size >= max_token_size);
+
+   nir_def *ray_history_addr = nir_imm_int64(b, rra_trace->ray_history_addr);
+
+   nir_def *launch_id = nir_load_ray_launch_id(b);
+
+   nir_def *trace = nir_imm_true(b);
+   for (uint32_t i = 0; i < 3; i++) {
+      nir_def *remainder = nir_umod_imm(b, nir_channel(b, launch_id, i), rra_trace->ray_history_resolution_scale);
+      trace = nir_iand(b, trace, nir_ieq_imm(b, remainder, 0));
+   }
+   nir_push_if(b, trace);
+
+   static_assert(offsetof(struct radv_ray_history_header, offset) == 0, "Unexpected offset");
+   nir_def *base_offset = nir_global_atomic(b, 32, ray_history_addr, token_size, .atomic_op = nir_atomic_op_iadd);
+
+   /* Abuse the dword alignment of token_size to add an invalid bit to offset. */
+   trace = nir_ieq_imm(b, nir_iand_imm(b, base_offset, 1), 0);
+
+   nir_def *in_bounds = nir_ule_imm(b, base_offset, rra_trace->ray_history_buffer_size - max_token_size);
+   /* Make sure we don't overwrite the header in case of an overflow. */
+   in_bounds = nir_iand(b, in_bounds, nir_uge_imm(b, base_offset, sizeof(struct radv_ray_history_header)));
+
+   nir_push_if(b, nir_iand(b, trace, in_bounds));
+
+   nir_def *dst_addr = nir_iadd(b, ray_history_addr, nir_u2u64(b, base_offset));
+
+   nir_def *launch_size = nir_load_ray_launch_size(b);
+
+   nir_def *launch_id_comps[3];
+   nir_def *launch_size_comps[3];
+   for (uint32_t i = 0; i < 3; i++) {
+      launch_id_comps[i] = nir_udiv_imm(b, nir_channel(b, launch_id, i), rra_trace->ray_history_resolution_scale);
+      launch_size_comps[i] = nir_udiv_imm(b, nir_channel(b, launch_size, i), rra_trace->ray_history_resolution_scale);
+   }
+
+   nir_def *global_index =
+      nir_iadd(b, launch_id_comps[0],
+               nir_iadd(b, nir_imul(b, launch_id_comps[1], launch_size_comps[0]),
+                        nir_imul(b, launch_id_comps[2], nir_imul(b, launch_size_comps[0], launch_size_comps[1]))));
+   nir_def *launch_index_and_hit = nir_bcsel(b, hit, nir_ior_imm(b, global_index, 1u << 29u), global_index);
+   nir_build_store_global(b, nir_ior_imm(b, launch_index_and_hit, token_type << 30), dst_addr, .align_mul = 4);
+
+   return nir_iadd_imm(b, dst_addr, 4);
+}
+
+static void
+radv_build_token_end(nir_builder *b)
+{
+   nir_pop_if(b, NULL);
+   nir_pop_if(b, NULL);
+}
+
+static void
+radv_build_end_trace_token(nir_builder *b, struct rt_variables *vars, nir_def *tmax, nir_def *hit,
+                           nir_def *iteration_instance_count)
+{
+   nir_def *token_size = nir_bcsel(b, hit, nir_imm_int(b, sizeof(struct radv_packed_end_trace_token)),
+                                   nir_imm_int(b, offsetof(struct radv_packed_end_trace_token, primitive_id)));
+
+   nir_def *dst_addr = radv_build_token_begin(b, vars, hit, radv_packed_token_end_trace, token_size,
+                                              sizeof(struct radv_packed_end_trace_token));
+   {
+      nir_build_store_global(b, nir_load_var(b, vars->accel_struct), dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 8);
+
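+      /* Reconstruct a global dispatch index from the header: the CP-written
+       * per-command-buffer index plus the accumulated per-submit base. */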
+      nir_def *dispatch_indices =
+         nir_load_smem_amd(b, 2, nir_imm_int64(b, vars->device->rra_trace.ray_history_addr),
+                           nir_imm_int(b, offsetof(struct radv_ray_history_header, dispatch_index)), .align_mul = 4);
+      nir_def *dispatch_index = nir_iadd(b, nir_channel(b, dispatch_indices, 0), nir_channel(b, dispatch_indices, 1));
+      nir_def *dispatch_and_flags = nir_iand_imm(b, nir_load_var(b, vars->cull_mask_and_flags), 0xFFFF);
+      dispatch_and_flags = nir_ior(b, dispatch_and_flags, dispatch_index);
+      nir_build_store_global(b, dispatch_and_flags, dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+      nir_def *shifted_cull_mask = nir_iand_imm(b, nir_load_var(b, vars->cull_mask_and_flags), 0xFF000000);
+
+      nir_def *packed_args = nir_load_var(b, vars->sbt_offset);
+      packed_args = nir_ior(b, packed_args, nir_ishl_imm(b, nir_load_var(b, vars->sbt_stride), 4));
+      packed_args = nir_ior(b, packed_args, nir_ishl_imm(b, nir_load_var(b, vars->miss_index), 8));
+      packed_args = nir_ior(b, packed_args, shifted_cull_mask);
+      nir_build_store_global(b, packed_args, dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+      nir_build_store_global(b, nir_load_var(b, vars->origin), dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 12);
+
+      nir_build_store_global(b, nir_load_var(b, vars->tmin), dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+      nir_build_store_global(b, nir_load_var(b, vars->direction), dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 12);
+
+      nir_build_store_global(b, tmax, dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+      nir_build_store_global(b, iteration_instance_count, dst_addr, .align_mul = 4);
+      dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+      nir_push_if(b, hit);
+      {
+         nir_build_store_global(b, nir_load_var(b, vars->primitive_id), dst_addr, .align_mul = 4);
+         dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+         nir_def *geometry_id = nir_iand_imm(b, nir_load_var(b, vars->geometry_id_and_flags), 0xFFFFFFF);
+         nir_build_store_global(b, geometry_id, dst_addr, .align_mul = 4);
+         dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+         nir_def *instance_id_and_hit_kind =
+            nir_build_load_global(b, 1, 32,
+                                  nir_iadd_imm(b, nir_load_var(b, vars->instance_addr),
+                                               offsetof(struct radv_bvh_instance_node, instance_id)));
+         instance_id_and_hit_kind =
+            nir_ior(b, instance_id_and_hit_kind, nir_ishl_imm(b, nir_load_var(b, vars->hit_kind), 24));
+         nir_build_store_global(b, instance_id_and_hit_kind, dst_addr, .align_mul = 4);
+         dst_addr = nir_iadd_imm(b, dst_addr, 4);
+
+         nir_build_store_global(b, nir_load_var(b, vars->tmax), dst_addr, .align_mul = 4);
+         dst_addr = nir_iadd_imm(b, dst_addr, 4);
+      }
+      nir_pop_if(b, NULL);
+   }
+   radv_build_token_end(b);
+}
+
 static nir_function_impl *
 lower_any_hit_for_intersection(nir_shader *any_hit)
 {
@@ -1432,6 +1563,14 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
      .sbt_offset_and_flags = nir_build_deref_var(b, trav_vars.sbt_offset_and_flags),
    };
 
+   nir_variable *iteration_instance_count = NULL;
+   if (vars->device->rra_trace.ray_history_addr) {
+      iteration_instance_count =
+         nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "iteration_instance_count");
+      nir_store_var(b, iteration_instance_count, nir_imm_int(b, 0), 0x1);
+      trav_vars_args.iteration_instance_count = nir_build_deref_var(b, iteration_instance_count);
+   }
+
    struct traversal_data data = {
       .device = device,
       .vars = vars,
@@ -1464,8 +1603,14 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
       .data = &data,
    };
 
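+   /* Read tmax before traversal: vars->tmax is overwritten whenever a hit is accepted. */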
+   nir_def *original_tmax = nir_load_var(b, vars->tmax);
+
    radv_build_ray_traversal(device, b, &args);
 
+   if (vars->device->rra_trace.ray_history_addr)
+      radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit),
+                                 nir_load_var(b, iteration_instance_count));
+
    nir_metadata_preserve(nir_shader_get_entrypoint(b->shader), nir_metadata_none);
 
    radv_nir_lower_hit_attrib_derefs(b->shader);
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index aba9c4dbe90..581738d2089 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -326,6 +326,8 @@ radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
    struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
 
    if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
+      util_dynarray_fini(&cmd_buffer->ray_history);
+
       list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
          radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
          cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
@@ -404,6 +406,8 @@ radv_create_cmd_buffer(struct vk_command_pool *pool, struct vk_command_buffer **
 
       for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
          vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
+
+      util_dynarray_init(&cmd_buffer->ray_history, NULL);
    }
 
    *cmd_buffer_out = &cmd_buffer->vk;
@@ -438,6 +442,8 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
       free(up);
    }
 
+   util_dynarray_clear(&cmd_buffer->ray_history);
+
    cmd_buffer->push_constant_stages = 0;
    cmd_buffer->scratch_size_per_wave_needed = 0;
    cmd_buffer->scratch_waves_wanted = 0;
@@ -10334,6 +10340,71 @@ radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_
    radv_compute_dispatch(cmd_buffer, &info);
 }
 
+static void
+radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
+                      uint64_t indirect_va)
+{
+   if (!cmd || indirect_va)
+      return;
+
+   struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
+   if (!data)
+      return;
+
+   uint32_t width = DIV_ROUND_UP(cmd->width, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
+   uint32_t height = DIV_ROUND_UP(cmd->height, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
+   uint32_t depth = DIV_ROUND_UP(cmd->depth, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
+
+   struct radv_rra_ray_history_counter counter = {
+      .dispatch_size = {width, height, depth},
+      .hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
+      .miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
+      .shader_count = cmd_buffer->state.rt_pipeline->stage_count,
+      .pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
+      .mode = 1,
+      .stride = sizeof(uint32_t),
+      .data_size = 0,
+      .ray_id_begin = 0,
+      .ray_id_end = 0xFFFFFFFF,
+      .pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
+   };
+
+   struct radv_rra_ray_history_dispatch_size dispatch_size = {
+      .size = {width, height, depth},
+   };
+
+   struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
+
+   data->metadata = (struct radv_rra_ray_history_metadata){
+      .counter_info.type = RADV_RRA_COUNTER_INFO,
+      .counter_info.size = sizeof(struct radv_rra_ray_history_counter),
+      .counter = counter,
+
+      .dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
+      .dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
+      .dispatch_size = dispatch_size,
+
+      .traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
+      .traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
+      .traversal_flags = traversal_flags,
+   };
+
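+   /* Pre-shifted into the high 16 bits so the shader can OR this header value
+    * directly with the 16 ray-flag bits in the low half of the token dword. */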
+   uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
+                             << 16;
+
+   util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
+
+   cmd_buffer->state.flush_bits |=
+      RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
+      radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL);
+
+   radv_update_buffer_cp(
+      cmd_buffer,
+      cmd_buffer->device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
+      &dispatch_index, sizeof(dispatch_index));
+}
+
 enum radv_rt_mode {
    radv_rt_mode_direct,
    radv_rt_mode_indirect,
@@ -10366,6 +10437,9 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2K
    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT)
       return;
 
+   if (unlikely(cmd_buffer->device->rra_trace.ray_history_buffer))
+      radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
+
    struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
    struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
    uint32_t base_reg = rt_prolog->info.user_data_0;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 357eae50821..0161c75cc8c 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -630,28 +630,8 @@ capture_trace(VkQueue _queue)
 
    VkResult result = VK_SUCCESS;
 
-   char filename[2048];
-   struct tm now;
-   time_t t;
-
-   t = time(NULL);
-   now = *localtime(&t);
-
-   if (queue->device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA) {
-      if (_mesa_hash_table_num_entries(queue->device->rra_trace.accel_structs) == 0) {
-         fprintf(stderr, "radv: No acceleration structures captured, not saving RRA trace.\n");
-      } else {
-         snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.rra", util_get_process_name(),
-                  1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
-
-         result = radv_rra_dump_trace(_queue, filename);
-
-         if (result == VK_SUCCESS)
-            fprintf(stderr, "radv: RRA capture saved to '%s'\n", filename);
-         else
-            fprintf(stderr, "radv: Failed to save RRA capture!\n");
-      }
-   }
+   if (queue->device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
+      queue->device->rra_trace.triggered = true;
 
    if (queue->device->vk.memory_trace_data.is_enabled) {
       simple_mtx_lock(&queue->device->vk.memory_trace_data.token_mtx);
@@ -1064,7 +1044,9 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
    }
 
    if ((device->instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(physical_device, false)) {
-      radv_rra_trace_init(device);
+      result = radv_rra_trace_init(device);
+      if (result != VK_SUCCESS)
+         goto fail;
    }
 
    if (device->vk.enabled_features.rayTracingPipelineShaderGroupHandleCaptureReplay) {
@@ -1087,6 +1069,8 @@ fail:
 
    radv_sqtt_finish(device);
 
+   radv_rra_trace_finish(radv_device_to_handle(device), &device->rra_trace);
+
    radv_spm_finish(device);
 
    radv_trap_handler_finish(device);
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
index 031ddafd28e..ca3868bd2e4 100644
--- a/src/amd/vulkan/radv_pipeline_rt.c
+++ b/src/amd/vulkan/radv_pipeline_rt.c
@@ -803,12 +803,13 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
       goto fail;
 
    bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags);
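+   /* Ray-history instrumentation is compiled into the shaders, so such
+    * pipelines must bypass the pipeline cache in both directions. */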
+   bool emit_ray_history = !!device->rra_trace.ray_history_buffer;
 
    radv_hash_rt_shaders(device, pipeline->sha1, stages, pCreateInfo, pipeline->groups);
    pipeline->base.base.pipeline_hash = *(uint64_t *)pipeline->sha1;
 
    bool cache_hit = false;
-   if (!keep_executable_info)
+   if (!keep_executable_info && !emit_ray_history)
       cache_hit = radv_ray_tracing_pipeline_cache_search(device, cache, pipeline, pCreateInfo);
 
    if (!cache_hit) {
@@ -828,7 +829,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
 
    radv_rmv_log_rt_pipeline_create(device, pipeline);
 
-   if (!cache_hit)
+   if (!cache_hit && !emit_ray_history)
       radv_ray_tracing_pipeline_cache_insert(device, cache, pipeline, pCreateInfo->stageCount, pipeline->sha1);
 
    /* write shader VAs into group handles */
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 122add82d26..a02b7dc799a 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -847,13 +847,131 @@ struct radv_rra_accel_struct_data {
 
 void radv_destroy_rra_accel_struct_data(VkDevice device, struct radv_rra_accel_struct_data *data);
 
+struct radv_ray_history_header {
+   uint32_t offset;
+   uint32_t dispatch_index;
+   uint32_t submit_base_index;
+};
+
+enum radv_packed_token_type {
+   radv_packed_token_end_trace,
+};
+
+struct radv_packed_token_header {
+   uint32_t launch_index : 29;
+   uint32_t hit : 1;
+   uint32_t token_type : 2;
+};
+
+struct radv_packed_end_trace_token {
+   struct radv_packed_token_header header;
+
+   uint32_t accel_struct_lo;
+   uint32_t accel_struct_hi;
+
+   uint32_t flags : 16;
+   uint32_t dispatch_index : 16;
+
+   uint32_t sbt_offset : 4;
+   uint32_t sbt_stride : 4;
+   uint32_t miss_index : 16;
+   uint32_t cull_mask : 8;
+
+   float origin[3];
+   float tmin;
+   float direction[3];
+   float tmax;
+
+   uint32_t iteration_count : 16;
+   uint32_t instance_count : 16;
+
+   uint32_t primitive_id;
+   uint32_t geometry_id;
+
+   uint32_t instance_id : 24;
+   uint32_t hit_kind : 8;
+
+   float t;
+};
+static_assert(sizeof(struct radv_packed_end_trace_token) == 72, "Unexpected radv_packed_end_trace_token size");
+
+enum radv_rra_ray_history_metadata_type {
+   RADV_RRA_COUNTER_INFO = 1,
+   RADV_RRA_DISPATCH_SIZE = 2,
+   RADV_RRA_TRAVERSAL_FLAGS = 3,
+};
+
+struct radv_rra_ray_history_metadata_info {
+   enum radv_rra_ray_history_metadata_type type : 32;
+   uint32_t padding;
+   uint64_t size;
+};
+
+enum radv_rra_pipeline_type {
+   RADV_RRA_PIPELINE_RAY_TRACING,
+};
+
+struct radv_rra_ray_history_counter {
+   uint32_t dispatch_size[3];
+   uint32_t hit_shader_count;
+   uint32_t miss_shader_count;
+   uint32_t shader_count;
+   uint64_t pipeline_api_hash;
+   uint32_t mode;
+   uint32_t mask;
+   uint32_t stride;
+   uint32_t data_size;
+   uint32_t lost_token_size;
+   uint32_t ray_id_begin;
+   uint32_t ray_id_end;
+   enum radv_rra_pipeline_type pipeline_type : 32;
+};
+
+struct radv_rra_ray_history_dispatch_size {
+   uint32_t size[3];
+   uint32_t padding;
+};
+
+struct radv_rra_ray_history_traversal_flags {
+   uint32_t box_sort_mode : 1;
+   uint32_t node_ptr_flags : 1;
+   uint32_t reserved : 30;
+   uint32_t padding;
+};
+
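+/* Written verbatim into the RRA file as three (info, payload) pairs; the
+ * static_assert below guards the expected layout. */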
+struct radv_rra_ray_history_metadata {
+   struct radv_rra_ray_history_metadata_info counter_info;
+   struct radv_rra_ray_history_counter counter;
+
+   struct radv_rra_ray_history_metadata_info dispatch_size_info;
+   struct radv_rra_ray_history_dispatch_size dispatch_size;
+
+   struct radv_rra_ray_history_metadata_info traversal_flags_info;
+   struct radv_rra_ray_history_traversal_flags traversal_flags;
+};
+static_assert(sizeof(struct radv_rra_ray_history_metadata) == 136,
+              "radv_rra_ray_history_metadata does not match RRA expectations");
+
+struct radv_rra_ray_history_data {
+   struct radv_rra_ray_history_metadata metadata;
+};
+
 struct radv_rra_trace_data {
    struct hash_table *accel_structs;
    struct hash_table_u64 *accel_struct_vas;
    simple_mtx_t data_mtx;
    bool validate_as;
    bool copy_after_build;
+   bool triggered;
    uint32_t copy_memory_index;
+
+   struct util_dynarray ray_history;
+   VkBuffer ray_history_buffer;
+   VkDeviceMemory ray_history_memory;
+   void *ray_history_data;
+   uint64_t ray_history_addr;
+   uint32_t ray_history_buffer_size;
+   uint32_t ray_history_resolution_scale;
 };
 
 enum radv_dispatch_table {
@@ -1776,6 +1894,8 @@ struct radv_cmd_buffer {
    uint64_t shader_upload_seq;
 
    uint32_t sqtt_cb_id;
+
+   struct util_dynarray ray_history;
 };
 
 static inline bool
@@ -3002,9 +3122,10 @@ VkResult radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_wins
 VkResult radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                          uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr);
 
-void radv_rra_trace_init(struct radv_device *device);
+VkResult radv_rra_trace_init(struct radv_device *device);
 
 VkResult radv_rra_dump_trace(VkQueue vk_queue, char *filename);
+void radv_rra_trace_clear_ray_history(VkDevice _device, struct radv_rra_trace_data *data);
 void radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data);
 
 void radv_memory_trace_init(struct radv_device *device);
diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c
index 6f08da3524f..e056252468a 100644
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@@ -42,6 +42,7 @@ static_assert(sizeof(struct rra_file_header) == 32, "rra_file_header does not ma
 
 enum rra_chunk_version {
    RADV_RRA_ASIC_API_INFO_CHUNK_VERSION = 0x1,
+   RADV_RRA_RAY_HISTORY_CHUNK_VERSION = 0x2,
    RADV_RRA_ACCEL_STRUCT_CHUNK_VERSION = 0xF0005,
 };
 
@@ -894,7 +895,7 @@ exit:
    return result;
 }
 
-void
+VkResult
 radv_rra_trace_init(struct radv_device *device)
 {
    device->rra_trace.validate_as = debug_get_bool_option("RADV_RRA_TRACE_VALIDATE", false);
@@ -906,11 +907,91 @@ radv_rra_trace_init(struct radv_device *device)
    device->rra_trace.copy_memory_index = radv_find_memory_index(
       device->physical_device, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                                   VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
+
+   util_dynarray_init(&device->rra_trace.ray_history, NULL);
+
+   device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
+   if (device->rra_trace.ray_history_buffer_size <
+       sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
+      return VK_SUCCESS;
+
+   device->rra_trace.ray_history_resolution_scale = debug_get_num_option("RADV_RRA_TRACE_RESOLUTION_SCALE", 1);
+   device->rra_trace.ray_history_resolution_scale = MAX2(device->rra_trace.ray_history_resolution_scale, 1);
+
+   VkBufferCreateInfo buffer_create_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+      .pNext =
+         &(VkBufferUsageFlags2CreateInfoKHR){
+            .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
+            .usage = VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR | VK_BUFFER_USAGE_2_SHADER_DEVICE_ADDRESS_BIT_KHR,
+         },
+      .size = device->rra_trace.ray_history_buffer_size,
+   };
+
+   VkDevice _device = radv_device_to_handle(device);
+   VkResult result = radv_CreateBuffer(_device, &buffer_create_info, NULL, &device->rra_trace.ray_history_buffer);
+   if (result != VK_SUCCESS)
+      return result;
+
+   VkMemoryRequirements requirements;
+   vk_common_GetBufferMemoryRequirements(_device, device->rra_trace.ray_history_buffer, &requirements);
+
+   VkMemoryAllocateInfo alloc_info = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+      .allocationSize = requirements.size,
+      .memoryTypeIndex = radv_find_memory_index(device->physical_device, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                                                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT),
+   };
+
+   result = radv_AllocateMemory(_device, &alloc_info, NULL, &device->rra_trace.ray_history_memory);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = vk_common_MapMemory(_device, device->rra_trace.ray_history_memory, 0, VK_WHOLE_SIZE, 0,
+                                (void **)&device->rra_trace.ray_history_data);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = vk_common_BindBufferMemory(_device, device->rra_trace.ray_history_buffer,
+                                       device->rra_trace.ray_history_memory, 0);
+
+   VkBufferDeviceAddressInfo addr_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
+      .buffer = device->rra_trace.ray_history_buffer,
+   };
+   device->rra_trace.ray_history_addr = radv_GetBufferDeviceAddress(_device, &addr_info);
+
+   struct radv_ray_history_header *ray_history_header = device->rra_trace.ray_history_data;
+   memset(ray_history_header, 0, sizeof(struct radv_ray_history_header));
+   ray_history_header->offset = 1;
+
+   return result;
+}
+
+void
+radv_rra_trace_clear_ray_history(VkDevice _device, struct radv_rra_trace_data *data)
+{
+   util_dynarray_foreach (&data->ray_history, struct radv_rra_ray_history_data *, _entry) {
+      struct radv_rra_ray_history_data *entry = *_entry;
+      free(entry);
+   }
+   util_dynarray_clear(&data->ray_history);
 }
 
 void
 radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
 {
+   radv_DestroyBuffer(vk_device, data->ray_history_buffer, NULL);
+
+   if (data->ray_history_memory)
+      vk_common_UnmapMemory(vk_device, data->ray_history_memory);
+
+   radv_FreeMemory(vk_device, data->ray_history_memory, NULL);
+
+   radv_rra_trace_clear_ray_history(vk_device, data);
+   util_dynarray_fini(&data->ray_history);
+
    if (data->accel_structs)
       hash_table_foreach (data->accel_structs, entry)
          radv_destroy_rra_accel_struct_data(vk_device, entry->data);
@@ -953,6 +1034,8 @@ struct rra_copy_context {
    void *mapped_data;
 
    struct hash_entry **entries;
+
+   uint32_t min_size;
 };
 
 static VkResult
@@ -962,7 +1045,7 @@ rra_copy_context_init(struct rra_copy_context *ctx)
    if (device->rra_trace.copy_after_build)
      return VK_SUCCESS;
 
-   uint32_t max_size = 0;
+   uint32_t max_size = ctx->min_size;
    uint32_t accel_struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
    for (unsigned i = 0; i < accel_struct_count; i++) {
       struct radv_rra_accel_struct_data *data = ctx->entries[i]->data;
@@ -1115,6 +1198,119 @@ rra_unmap_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
 
    vk_common_UnmapMemory(ctx->device, data->memory);
 }
 
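+/* RRA ray history token vocabulary; this dump path only ever emits begin and
+ * end2 tokens. */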
+enum rra_ray_history_token_type {
+   rra_ray_history_token_begin,
+   rra_ray_history_token_tlas,
+   rra_ray_history_token_blas,
+   rra_ray_history_token_end,
+   rra_ray_history_token_call,
+   rra_ray_history_token_timestamp,
+   rra_ray_history_token_ahit_status,
+   rra_ray_history_token_call2,
+   rra_ray_history_token_isec_status,
+   rra_ray_history_token_end2,
+   rra_ray_history_token_begin2,
+   rra_ray_history_token_normal = 0xFFFF,
+};
+
+struct rra_ray_history_id_token {
+   uint32_t id : 30;
+   uint32_t reserved : 1;
+   uint32_t has_control : 1;
+};
+static_assert(sizeof(struct rra_ray_history_id_token) == 4, "rra_ray_history_id_token does not match RRA expectations");
+
+struct rra_ray_history_control_token {
+   uint32_t type : 16;
+   uint32_t length : 8;
+   uint32_t data : 8;
+};
+static_assert(sizeof(struct rra_ray_history_control_token) == 4,
+              "rra_ray_history_control_token does not match RRA expectations");
+
+struct rra_ray_history_begin_token {
+   uint32_t wave_id;
+   uint32_t launch_ids[3];
+   uint32_t accel_struct_lo;
+   uint32_t accel_struct_hi;
+   uint32_t ray_flags;
+   uint32_t cull_mask : 8;
+   uint32_t stb_offset : 4;
+   uint32_t stb_stride : 4;
+   uint32_t miss_index : 16;
+   float origin[3];
+   float tmin;
+   float direction[3];
+   float tmax;
+};
+static_assert(sizeof(struct rra_ray_history_begin_token) == 64,
+              "rra_ray_history_begin_token does not match RRA expectations");
+
+struct rra_ray_history_begin2_token {
+   struct rra_ray_history_begin_token base;
+   uint32_t call_instruction_id;
+   uint32_t unique_wave_id;
+   uint32_t parent_unique_wave_id;
+};
+static_assert(sizeof(struct rra_ray_history_begin2_token) == 76,
+              "rra_ray_history_begin2_token does not match RRA expectations");
+
+struct rra_ray_history_end_token {
+   uint32_t primitive_index;
+   uint32_t geometry_index;
+};
+static_assert(sizeof(struct rra_ray_history_end_token) == 8,
+              "rra_ray_history_end_token does not match RRA expectations");
+
+struct rra_ray_history_end2_token {
+   struct rra_ray_history_end_token base;
+   uint32_t instance_index : 24;
+   uint32_t hit_kind : 8;
+   uint32_t iteration_count;
+   uint32_t candidate_instance_count;
+   float t;
+};
+static_assert(sizeof(struct rra_ray_history_end2_token) == 24,
+              "rra_ray_history_end2_token does not match RRA expectations");
+
+struct rra_ray_history_tlas_token {
+   uint64_t addr;
+};
+static_assert(sizeof(struct rra_ray_history_tlas_token) == 8,
+              "rra_ray_history_tlas_token does not match RRA expectations");
+
+struct rra_ray_history_blas_token {
+   uint64_t addr;
+};
+static_assert(sizeof(struct rra_ray_history_blas_token) == 8,
+              "rra_ray_history_blas_token does not match RRA expectations");
+
+struct rra_ray_history_call_token {
+   uint32_t addr[2];
+};
+static_assert(sizeof(struct rra_ray_history_call_token) == 8,
+              "rra_ray_history_call_token does not match RRA expectations");
+
+struct rra_ray_history_call2_token {
+   struct rra_ray_history_call_token base;
+   uint32_t sbt_index;
+};
+static_assert(sizeof(struct rra_ray_history_call2_token) == 12,
+              "rra_ray_history_call2_token does not match RRA expectations");
+
+struct rra_ray_history_isec_token {
+   float t;
+   uint32_t hit_kind;
+};
+static_assert(sizeof(struct rra_ray_history_isec_token) == 8,
+              "rra_ray_history_isec_token does not match RRA expectations");
+
+struct rra_ray_history_timestamp_token {
+   uint64_t gpu_timestamp;
+};
+static_assert(sizeof(struct rra_ray_history_timestamp_token) == 8,
+              "rra_ray_history_timestamp_token does not match RRA expectations");
+
 VkResult
 radv_rra_dump_trace(VkQueue vk_queue, char *filename)
 {
@@ -1127,13 +1323,22 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
       return result;
 
    uint64_t *accel_struct_offsets = NULL;
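+   /* One file offset per traced dispatch; zero means no metadata chunk has
+    * been written for that dispatch yet. */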
+   uint64_t *ray_history_offsets = NULL;
    struct hash_entry **hash_entries = NULL;
    FILE *file = NULL;
 
    uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
    accel_struct_offsets = calloc(struct_count, sizeof(uint64_t));
    if (!accel_struct_offsets)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   uint32_t dispatch_count =
+      util_dynarray_num_elements(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *);
+   ray_history_offsets = calloc(dispatch_count, sizeof(uint64_t));
+   if (!ray_history_offsets) {
       result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      goto cleanup;
+   }
 
    hash_entries = malloc(sizeof(*hash_entries) * struct_count);
    if (!hash_entries) {
@@ -1175,6 +1380,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
       .queue = vk_queue,
       .entries = hash_entries,
       .family_index = queue->vk.queue_family_index,
+      .min_size = device->rra_trace.ray_history_buffer_size,
    };
 
    result = rra_copy_context_init(&copy_ctx);
@@ -1197,6 +1403,118 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
       written_accel_struct_count++;
    }
 
+   uint64_t ray_history_offset = (uint64_t)ftell(file);
+
+   uint32_t ray_history_index = 0xFFFFFFFF;
+   struct radv_rra_ray_history_data *ray_history = NULL;
+
+   uint8_t *history = device->rra_trace.ray_history_data;
+   struct radv_ray_history_header *history_header = (void *)history;
+
+   uint32_t history_buffer_size_mb = device->rra_trace.ray_history_buffer_size / 1024 / 1024;
+   uint32_t history_size_mb = history_header->offset / 1024 / 1024;
+   if (history_header->offset > device->rra_trace.ray_history_buffer_size) {
+      fprintf(stderr, "radv: rra: The ray history buffer size (%u MB) is too small. %u MB is required.\n",
+              history_buffer_size_mb, history_size_mb);
+   } else {
+      fprintf(stderr, "radv: rra: Ray history buffer size = %u MB, ray history size = %u MB.\n", history_buffer_size_mb,
+              history_size_mb);
+   }
+
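+   /* Walk the packed tokens; tokens without a hit omit the trailing hit
+    * fields, so the stride varies per token. */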
+   uint32_t token_size;
+   for (uint32_t offset = sizeof(struct radv_ray_history_header); offset < history_header->offset;
+        offset += token_size) {
+      struct radv_packed_end_trace_token *src = (void *)(history + offset);
+      token_size = src->header.hit ? sizeof(struct radv_packed_end_trace_token)
+                                   : offsetof(struct radv_packed_end_trace_token, primitive_id);
+
+      if (src->dispatch_index != ray_history_index) {
+         ray_history_index = src->dispatch_index;
+         assert(ray_history_index < dispatch_count);
+         ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *,
+                                              ray_history_index);
+
+         assert(!ray_history_offsets[ray_history_index]);
+         ray_history_offsets[ray_history_index] = (uint64_t)ftell(file);
+         fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
+      }
+
+      uint32_t *dispatch_size = ray_history->metadata.dispatch_size.size;
+
+      uint32_t x = src->header.launch_index % dispatch_size[0];
+      uint32_t y = (src->header.launch_index / dispatch_size[0]) % dispatch_size[1];
+      uint32_t z = src->header.launch_index / (dispatch_size[0] * dispatch_size[1]);
+
+      struct rra_ray_history_id_token begin_id = {
+         .id = src->header.launch_index,
+         .has_control = true,
+      };
+      struct rra_ray_history_control_token begin_control = {
+         .type = rra_ray_history_token_begin,
+         .length = sizeof(struct rra_ray_history_begin_token) / 4,
+      };
+      struct rra_ray_history_begin_token begin = {
+         .wave_id = src->header.launch_index / 32,
+         .launch_ids = {x, y, z},
+         .accel_struct_lo = src->accel_struct_lo,
+         .accel_struct_hi = src->accel_struct_hi & 0x1FFFFFF,
+         .ray_flags = src->flags,
+         .cull_mask = src->cull_mask,
+         .stb_offset = src->sbt_offset,
+         .stb_stride = src->sbt_stride,
+         .miss_index = src->miss_index,
+         .origin[0] = src->origin[0],
+         .origin[1] = src->origin[1],
+         .origin[2] = src->origin[2],
+         .tmin = src->tmin,
+         .direction[0] = src->direction[0],
+         .direction[1] = src->direction[1],
+         .direction[2] = src->direction[2],
+         .tmax = src->tmax,
+      };
+      fwrite(&begin_id, sizeof(begin_id), 1, file);
+      fwrite(&begin_control, sizeof(begin_control), 1, file);
+      fwrite(&begin, sizeof(begin), 1, file);
+
+      struct rra_ray_history_id_token end_id = {
+         .id = src->header.launch_index,
+         .has_control = true,
+      };
+      struct rra_ray_history_control_token end_control = {
+         .type = rra_ray_history_token_end2,
+         .length = sizeof(struct rra_ray_history_end2_token) / 4,
+      };
+      struct rra_ray_history_end2_token end = {
+         .base.primitive_index = 0xFFFFFFFF,
+         .base.geometry_index = 0xFFFFFFFF,
+         .iteration_count = src->iteration_count,
+         .candidate_instance_count = src->instance_count,
+      };
+
+      if (src->header.hit) {
+         end.base.primitive_index = src->primitive_id;
+         end.base.geometry_index = src->geometry_id;
+         end.instance_index = src->instance_id;
+         end.hit_kind = src->hit_kind;
+         end.t = src->t;
+      }
+
+      fwrite(&end_id, sizeof(end_id), 1, file);
+      fwrite(&end_control, sizeof(end_control), 1, file);
+      fwrite(&end, sizeof(end), 1, file);
+   }
+
+   for (uint32_t i = 0; i < dispatch_count; i++) {
+      if (ray_history_offsets[i])
+         continue;
+
+      ray_history = *util_dynarray_element(&device->rra_trace.ray_history, struct radv_rra_ray_history_data *, i);
+      ray_history_offsets[i] = (uint64_t)ftell(file);
+      fwrite(&ray_history->metadata, sizeof(struct radv_rra_ray_history_metadata), 1, file);
+   }
+
+   history_header->offset = 1;
+
    rra_copy_context_finish(&copy_ctx);
 
    uint64_t chunk_info_offset = (uint64_t)ftell(file);
@@ -1204,10 +1522,24 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
 
    rra_dump_chunk_description(asic_info_offset, 0, sizeof(struct rra_asic_info), "AsicInfo",
                               RADV_RRA_ASIC_API_INFO_CHUNK_VERSION, file);
 
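+   /* Each traced dispatch contributes a metadata chunk followed by its raw
+    * token stream. */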
+   for (uint32_t i = 0; i < dispatch_count; i++) {
+      uint64_t tokens_size;
+      if (i == dispatch_count - 1)
+         tokens_size = (uint64_t)(chunk_info_offset - ray_history_offsets[i]);
+      else
+         tokens_size = (uint64_t)(ray_history_offsets[i + 1] - ray_history_offsets[i]);
+      tokens_size -= sizeof(struct radv_rra_ray_history_metadata);
+
+      rra_dump_chunk_description(ray_history_offsets[i], 0, sizeof(struct radv_rra_ray_history_metadata),
+                                 "HistoryMetadata", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
+      rra_dump_chunk_description(ray_history_offsets[i] + sizeof(struct radv_rra_ray_history_metadata), 0, tokens_size,
+                                 "HistoryTokensRaw", RADV_RRA_RAY_HISTORY_CHUNK_VERSION, file);
+   }
+
    for (uint32_t i = 0; i < written_accel_struct_count; ++i) {
       uint64_t accel_struct_size;
       if (i == written_accel_struct_count - 1)
-         accel_struct_size = (uint64_t)(chunk_info_offset - accel_struct_offsets[i]);
+         accel_struct_size = (uint64_t)(ray_history_offset - accel_struct_offsets[i]);
      else
          accel_struct_size = (uint64_t)(accel_struct_offsets[i + 1] - accel_struct_offsets[i]);
 
@@ -1227,6 +1559,7 @@ cleanup:
    fclose(file);
 
    free(hash_entries);
+   free(ray_history_offsets);
    free(accel_struct_offsets);
    return result;
 }