diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index aa542ae94e0..946604baca5 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -29,12 +29,48 @@ #include "radv_cs.h" #include "radv_meta.h" +#include "radix_sort/radv_radix_sort.h" + +/* Min and max bounds of the bvh used to compute morton codes */ +#define SCRATCH_TOTAL_BOUNDS_SIZE (6 * sizeof(float)) + +enum accel_struct_build { + accel_struct_build_unoptimized, + accel_struct_build_lbvh, +}; + +static enum accel_struct_build +get_accel_struct_build(const struct radv_physical_device *pdevice, + VkAccelerationStructureBuildTypeKHR buildType) +{ + if (buildType != VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR) + return accel_struct_build_unoptimized; + + return (pdevice->rad_info.chip_class < GFX10) ? accel_struct_build_unoptimized + : accel_struct_build_lbvh; +} + +static uint32_t +get_node_id_stride(enum accel_struct_build build_mode) +{ + switch (build_mode) { + case accel_struct_build_unoptimized: + return 4; + case accel_struct_build_lbvh: + return 8; + default: + unreachable("Unhandled accel_struct_build!"); + } +} + VKAPI_ATTR void VKAPI_CALL radv_GetAccelerationStructureBuildSizesKHR( VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, const uint32_t *pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo) { + RADV_FROM_HANDLE(radv_device, device, _device); + uint64_t triangles = 0, boxes = 0, instances = 0; STATIC_ASSERT(sizeof(struct radv_bvh_triangle_node) == 64); @@ -79,9 +115,30 @@ radv_GetAccelerationStructureBuildSizesKHR( pSizeInfo->accelerationStructureSize = size; - /* 2x the max number of nodes in a BVH layer (one uint32_t each) */ - pSizeInfo->updateScratchSize = pSizeInfo->buildScratchSize = - MAX2(4096, 2 * (boxes + instances + triangles) * sizeof(uint32_t)); + /* 2x the max number of nodes in a BVH layer and order information for sorting when using + * LBVH (one uint32_t each, two buffers) plus space to store the bounds. + * LBVH is only supported for device builds and hardware that supports global atomics. + */ + enum accel_struct_build build_mode = get_accel_struct_build(device->physical_device, buildType); + uint32_t node_id_stride = get_node_id_stride(build_mode); + + uint32_t leaf_count = boxes + instances + triangles; + VkDeviceSize scratchSize = 2 * leaf_count * node_id_stride; + + if (build_mode == accel_struct_build_lbvh) { + radix_sort_vk_memory_requirements_t requirements; + radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort, + leaf_count, &requirements); + + /* Make sure we have the space required by the radix sort. 
*/ + scratchSize = MAX2(scratchSize, requirements.keyvals_size * 2); + + scratchSize += requirements.internal_size + SCRATCH_TOTAL_BOUNDS_SIZE; + } + + scratchSize = MAX2(4096, scratchSize); + pSizeInfo->updateScratchSize = scratchSize; + pSizeInfo->buildScratchSize = scratchSize; } VKAPI_ATTR VkResult VKAPI_CALL @@ -745,6 +802,19 @@ radv_CopyAccelerationStructureKHR(VkDevice _device, VkDeferredOperationKHR defer return VK_SUCCESS; } +static nir_builder +create_accel_build_shader(struct radv_device *device, const char *name) +{ + nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "%s", name); + b.shader->info.workgroup_size[0] = 64; + + assert(b.shader->info.workgroup_size[1] == 1); + assert(b.shader->info.workgroup_size[2] == 1); + assert(!b.shader->info.workgroup_size_variable); + + return b; +} + static nir_ssa_def * get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id) { @@ -935,6 +1005,21 @@ struct build_primitive_constants { }; }; +struct bounds_constants { + uint64_t node_addr; + uint64_t scratch_addr; +}; + +struct morton_constants { + uint64_t node_addr; + uint64_t scratch_addr; +}; + +struct fill_constants { + uint64_t addr; + uint32_t value; +}; + struct build_internal_constants { uint64_t node_dst_addr; uint64_t scratch_addr; @@ -972,6 +1057,29 @@ nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3]) } } +static nir_ssa_def * +id_to_node_id_offset(nir_builder *b, nir_ssa_def *global_id, + const struct radv_physical_device *pdevice) +{ + uint32_t stride = get_node_id_stride( + get_accel_struct_build(pdevice, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR)); + + return nir_umul24(b, global_id, nir_imm_int(b, stride)); +} + +static nir_ssa_def * +id_to_morton_offset(nir_builder *b, nir_ssa_def *global_id, + const struct radv_physical_device *pdevice) +{ + enum accel_struct_build build_mode = + get_accel_struct_build(pdevice, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR); + assert(build_mode == accel_struct_build_lbvh); + + uint32_t stride = get_node_id_stride(build_mode); + + return nir_iadd_imm(b, nir_umul24(b, global_id, nir_imm_int(b, stride)), sizeof(uint32_t)); +} + static nir_shader * build_leaf_shader(struct radv_device *dev) { @@ -1003,9 +1111,15 @@ build_leaf_shader(struct radv_device *dev) nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1), nir_imm_int(&b, b.shader->info.workgroup_size[0])), nir_channels(&b, nir_load_local_invocation_id(&b), 1)); - scratch_addr = nir_iadd( - &b, scratch_addr, - nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4))))); + nir_ssa_def *scratch_dst_addr = + nir_iadd(&b, scratch_addr, + nir_u2u64(&b, nir_iadd(&b, scratch_offset, + id_to_node_id_offset(&b, global_id, dev->physical_device)))); + + nir_variable *bounds[2] = { + nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"), + nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"), + }; nir_push_if(&b, nir_ieq_imm(&b, geom_type, VK_GEOMETRY_TYPE_TRIANGLES_KHR)); { /* Triangles */ @@ -1053,6 +1167,22 @@ build_leaf_shader(struct radv_device *dev) for (unsigned j = 0; j < 3; ++j) node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j])); + nir_ssa_def *min_bound = NULL; + nir_ssa_def *max_bound = NULL; + for (unsigned i = 0; i < 3; ++i) { + nir_ssa_def *position = nir_vec(&b, node_data + i * 3, 3); + if (min_bound) { + min_bound = nir_fmin(&b, min_bound, position); + max_bound = nir_fmax(&b, max_bound, 
position);
+         } else {
+            min_bound = position;
+            max_bound = position;
+         }
+      }
+
+      nir_store_var(&b, bounds[0], min_bound, 7);
+      nir_store_var(&b, bounds[1], max_bound, 7);
+
       node_data[12] = global_id;
       node_data[13] = geometry_id;
       node_data[15] = nir_imm_int(&b, 9);
@@ -1066,7 +1196,7 @@ build_leaf_shader(struct radv_device *dev)
       }
 
       nir_ssa_def *node_id = nir_ushr_imm(&b, node_offset, 3);
-      nir_build_store_global(&b, node_id, scratch_addr);
+      nir_build_store_global(&b, node_id, scratch_dst_addr);
    }
    nir_push_else(&b, NULL);
    nir_push_if(&b, nir_ieq_imm(&b, geom_type, VK_GEOMETRY_TYPE_AABBS_KHR));
@@ -1077,14 +1207,18 @@ build_leaf_shader(struct radv_device *dev)
       nir_ssa_def *node_offset =
          nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
       nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
+
       nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 7);
-      nir_build_store_global(&b, node_id, scratch_addr);
+      nir_build_store_global(&b, node_id, scratch_dst_addr);
 
       aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));
 
       nir_ssa_def *min_bound = nir_build_load_global(&b, 3, 32, nir_iadd_imm(&b, aabb_addr, 0));
       nir_ssa_def *max_bound = nir_build_load_global(&b, 3, 32, nir_iadd_imm(&b, aabb_addr, 12));
 
+      nir_store_var(&b, bounds[0], min_bound, 7);
+      nir_store_var(&b, bounds[1], max_bound, 7);
+
       nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),
                                nir_channel(&b, min_bound, 1),
                                nir_channel(&b, min_bound, 2),
@@ -1130,16 +1264,12 @@ build_leaf_shader(struct radv_device *dev)
       nir_ssa_def *node_offset =
          nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));
       node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
+
       nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 6);
-      nir_build_store_global(&b, node_id, scratch_addr);
-
-      nir_variable *bounds[2] = {
-         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
-         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
-      };
+      nir_build_store_global(&b, node_id, scratch_dst_addr);
 
       nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
       nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
 
       nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));
       nir_push_if(&b, nir_ine_imm(&b, header_addr, 0));
@@ -1204,6 +1331,32 @@ build_leaf_shader(struct radv_device *dev)
    nir_pop_if(&b, NULL);
    nir_pop_if(&b, NULL);
 
+   if (get_accel_struct_build(dev->physical_device,
+                              VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR) !=
+       accel_struct_build_unoptimized) {
+      nir_ssa_def *min = nir_load_var(&b, bounds[0]);
+      nir_ssa_def *max = nir_load_var(&b, bounds[1]);
+
+      nir_ssa_def *min_reduced = nir_reduce(&b, min, .reduction_op = nir_op_fmin);
+      nir_ssa_def *max_reduced = nir_reduce(&b, max, .reduction_op = nir_op_fmax);
+
+      nir_push_if(&b, nir_elect(&b, 1));
+
+      nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 24)),
+                             nir_channel(&b, min_reduced, 0));
+      nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 20)),
+                             nir_channel(&b, min_reduced, 1));
+      nir_global_atomic_fmin(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 16)),
+                             nir_channel(&b, min_reduced, 2));
+
+      nir_global_atomic_fmax(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 12)),
+                             nir_channel(&b, max_reduced, 0));
+      nir_global_atomic_fmax(&b, 32,
nir_isub(&b, scratch_addr, nir_imm_int64(&b, 8)), + nir_channel(&b, max_reduced, 1)); + nir_global_atomic_fmax(&b, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 4)), + nir_channel(&b, max_reduced, 2)); + } + return b.shader; } @@ -1267,6 +1420,89 @@ determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def *node_id, nir_pop_if(b, NULL); } +/* https://developer.nvidia.com/blog/thinking-parallel-part-iii-tree-construction-gpu/ */ +static nir_ssa_def * +build_morton_component(nir_builder *b, nir_ssa_def *x) +{ + x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000101u), 0x0F00F00Fu); + x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000011u), 0xC30C30C3u); + x = nir_iand_imm(b, nir_imul_imm(b, x, 0x00000005u), 0x49249249u); + return x; +} + +static nir_shader * +build_morton_shader(struct radv_device *dev) +{ + const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); + + nir_builder b = create_accel_build_shader(dev, "accel_build_morton_shader"); + + /* + * push constants: + * i32 x 2: node address + * i32 x 2: scratch address + */ + nir_ssa_def *pconst0 = + nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + + nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b0011)); + nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b1100)); + + nir_ssa_def *global_id = + nir_iadd(&b, + nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b, 32), 0), + b.shader->info.workgroup_size[0]), + nir_load_local_invocation_index(&b)); + + nir_ssa_def *node_id_addr = nir_iadd( + &b, scratch_addr, nir_u2u64(&b, id_to_node_id_offset(&b, global_id, dev->physical_device))); + nir_ssa_def *node_id = + nir_build_load_global(&b, 1, 32, node_id_addr, .align_mul = 4, .align_offset = 0); + + nir_variable *node_bounds[2] = { + nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"), + nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"), + }; + + determine_bounds(&b, node_addr, node_id, node_bounds); + + nir_ssa_def *node_min = nir_load_var(&b, node_bounds[0]); + nir_ssa_def *node_max = nir_load_var(&b, node_bounds[1]); + nir_ssa_def *node_pos = + nir_fmul(&b, nir_fadd(&b, node_min, node_max), nir_imm_vec3(&b, 0.5, 0.5, 0.5)); + + nir_ssa_def *bvh_min = + nir_build_load_global(&b, 3, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 24)), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *bvh_max = + nir_build_load_global(&b, 3, 32, nir_isub(&b, scratch_addr, nir_imm_int64(&b, 12)), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *bvh_size = nir_fsub(&b, bvh_max, bvh_min); + + nir_ssa_def *normalized_node_pos = nir_fdiv(&b, nir_fsub(&b, node_pos, bvh_min), bvh_size); + + nir_ssa_def *x_int = + nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 0), 255.0)); + nir_ssa_def *x_morton = build_morton_component(&b, x_int); + + nir_ssa_def *y_int = + nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 1), 255.0)); + nir_ssa_def *y_morton = build_morton_component(&b, y_int); + + nir_ssa_def *z_int = + nir_f2u32(&b, nir_fmul_imm(&b, nir_channel(&b, normalized_node_pos, 2), 255.0)); + nir_ssa_def *z_morton = build_morton_component(&b, z_int); + + nir_ssa_def *morton_code = nir_iadd( + &b, nir_iadd(&b, nir_ishl_imm(&b, x_morton, 2), nir_ishl_imm(&b, y_morton, 1)), z_morton); + + nir_ssa_def *dst_addr = nir_iadd( + &b, scratch_addr, nir_u2u64(&b, id_to_morton_offset(&b, global_id, dev->physical_device))); + nir_build_store_global(&b, morton_code, dst_addr, 
.align_mul = 4); + + return b.shader; +} + static nir_shader * build_internal_shader(struct radv_device *dev) { @@ -1308,12 +1544,22 @@ build_internal_shader(struct radv_device *dev) nir_ssa_def *node_offset = nir_iadd(&b, node_dst_offset, nir_ishl_imm(&b, global_id, 7)); nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset)); - nir_ssa_def *src_nodes = nir_build_load_global( - &b, 4, 32, - nir_iadd(&b, scratch_addr, - nir_u2u64(&b, nir_iadd(&b, src_scratch_offset, nir_ishl_imm(&b, global_id, 4))))); - nir_build_store_global(&b, src_nodes, nir_iadd_imm(&b, node_dst_addr, 0)); + nir_ssa_def *src_base_addr = + nir_iadd(&b, scratch_addr, + nir_u2u64(&b, nir_iadd(&b, src_scratch_offset, + id_to_node_id_offset(&b, src_idx, dev->physical_device)))); + + enum accel_struct_build build_mode = + get_accel_struct_build(dev->physical_device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR); + uint32_t node_id_stride = get_node_id_stride(build_mode); + + nir_ssa_def *src_nodes[4]; + for (uint32_t i = 0; i < 4; i++) { + src_nodes[i] = + nir_build_load_global(&b, 1, 32, nir_iadd_imm(&b, src_base_addr, i * node_id_stride)); + nir_build_store_global(&b, src_nodes[i], nir_iadd_imm(&b, node_dst_addr, i * 4)); + } nir_ssa_def *total_bounds[2] = { nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), @@ -1329,7 +1575,7 @@ build_internal_shader(struct radv_device *dev) nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7); nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count)); - determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds); + determine_bounds(&b, node_addr, src_nodes[i], bounds); nir_pop_if(&b, NULL); nir_build_store_global(&b, nir_load_var(&b, bounds[0]), nir_iadd_imm(&b, node_dst_addr, 16 + 24 * i)); @@ -1342,7 +1588,8 @@ build_internal_shader(struct radv_device *dev) nir_ssa_def *node_id = nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), 5); nir_ssa_def *dst_scratch_addr = nir_iadd(&b, scratch_addr, - nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl_imm(&b, global_id, 2)))); + nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, + id_to_node_id_offset(&b, global_id, dev->physical_device)))); nir_build_store_global(&b, node_id, dst_scratch_addr); nir_push_if(&b, fill_header); @@ -1586,12 +1833,82 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline, &state->alloc); + radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.morton_pipeline, + &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.copy_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.internal_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.leaf_p_layout, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->accel_struct_build.morton_p_layout, &state->alloc); + + if (state->accel_struct_build.radix_sort) + radix_sort_vk_destroy(state->accel_struct_build.radix_sort, radv_device_to_handle(device), + &state->alloc); +} + +static VkResult +create_build_pipeline(struct radv_device *device, nir_shader *shader, unsigned push_constant_size, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + const VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + 
.setLayoutCount = 0,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges =
+         &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size},
+   };
+
+   VkResult result = radv_CreatePipelineLayout(radv_device_to_handle(device), &pl_create_info,
+                                               &device->meta_state.alloc, layout);
+   if (result != VK_SUCCESS) {
+      radv_device_finish_accel_struct_build_state(device);
+      ralloc_free(shader);
+      return result;
+   }
+
+   VkPipelineShaderStageCreateInfo shader_stage = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module = vk_shader_module_handle_from_nir(shader),
+      .pName = "main",
+      .pSpecializationInfo = NULL,
+   };
+
+   VkComputePipelineCreateInfo pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+      .stage = shader_stage,
+      .flags = 0,
+      .layout = *layout,
+   };
+
+   result = radv_CreateComputePipelines(radv_device_to_handle(device),
+                                        radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
+                                        &pipeline_info, &device->meta_state.alloc, pipeline);
+
+   if (result != VK_SUCCESS) {
+      radv_device_finish_accel_struct_build_state(device);
+      ralloc_free(shader);
+      return result;
+   }
+
+   /* The pipeline holds the compiled code; the NIR is no longer needed. */
+   ralloc_free(shader);
+
+   return VK_SUCCESS;
+}
+
+static void
+radix_sort_fill_buffer(VkCommandBuffer commandBuffer,
+                       radix_sort_vk_buffer_info_t const *buffer_info, VkDeviceSize offset,
+                       VkDeviceSize size, uint32_t data)
+{
+   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(size % 4 == 0);
+   assert(size != VK_WHOLE_SIZE);
+
+   radv_fill_buffer_shader(cmd_buffer, buffer_info->devaddr + buffer_info->offset + offset, size,
+                           data);
 }
 
 VkResult
@@ -1602,122 +1919,47 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
    nir_shader *internal_cs = build_internal_shader(device);
    nir_shader *copy_cs = build_copy_shader(device);
 
-   const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-      .setLayoutCount = 0,
-      .pushConstantRangeCount = 1,
-      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
-                                                    sizeof(struct build_primitive_constants)},
-   };
-
-   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
-                                      &device->meta_state.alloc,
-                                      &device->meta_state.accel_struct_build.leaf_p_layout);
+   result = create_build_pipeline(device, leaf_cs, sizeof(struct build_primitive_constants),
+                                  &device->meta_state.accel_struct_build.leaf_pipeline,
+                                  &device->meta_state.accel_struct_build.leaf_p_layout);
    if (result != VK_SUCCESS)
-      goto fail;
+      return result;
 
-   VkPipelineShaderStageCreateInfo leaf_shader_stage = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
-      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
-      .module = vk_shader_module_handle_from_nir(leaf_cs),
-      .pName = "main",
-      .pSpecializationInfo = NULL,
-   };
-
-   VkComputePipelineCreateInfo leaf_pipeline_info = {
-      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
-      .stage = leaf_shader_stage,
-      .flags = 0,
-      .layout = device->meta_state.accel_struct_build.leaf_p_layout,
-   };
-
-   result = radv_CreateComputePipelines(
-      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
-      &leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);
+   result = create_build_pipeline(device, internal_cs, sizeof(struct build_internal_constants),
+                                  &device->meta_state.accel_struct_build.internal_pipeline,
+                                  &device->meta_state.accel_struct_build.internal_p_layout);
    if (result != VK_SUCCESS)
-      goto fail;
+      return result;
 
-   const VkPipelineLayoutCreateInfo internal_pl_create_info = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-      .setLayoutCount = 0,
-      .pushConstantRangeCount = 1,
-      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
-                                                    sizeof(struct build_internal_constants)},
-   };
+   result = create_build_pipeline(device, copy_cs, sizeof(struct copy_constants),
+                                  &device->meta_state.accel_struct_build.copy_pipeline,
+                                  &device->meta_state.accel_struct_build.copy_p_layout);
 
-   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,
-                                      &device->meta_state.alloc,
-                                      &device->meta_state.accel_struct_build.internal_p_layout);
    if (result != VK_SUCCESS)
-      goto fail;
+      return result;
 
-   VkPipelineShaderStageCreateInfo internal_shader_stage = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
-      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
-      .module = vk_shader_module_handle_from_nir(internal_cs),
-      .pName = "main",
-      .pSpecializationInfo = NULL,
-   };
+   if (get_accel_struct_build(device->physical_device,
+                              VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR) ==
+       accel_struct_build_lbvh) {
+      nir_shader *morton_cs = build_morton_shader(device);
 
-   VkComputePipelineCreateInfo internal_pipeline_info = {
-      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
-      .stage = internal_shader_stage,
-      .flags = 0,
-      .layout = device->meta_state.accel_struct_build.internal_p_layout,
-   };
+      result = create_build_pipeline(device, morton_cs, sizeof(struct morton_constants),
+                                     &device->meta_state.accel_struct_build.morton_pipeline,
+                                     &device->meta_state.accel_struct_build.morton_p_layout);
+      if (result != VK_SUCCESS)
+         return result;
 
-   result = radv_CreateComputePipelines(
-      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
-      &internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);
-   if (result != VK_SUCCESS)
-      goto fail;
+      device->meta_state.accel_struct_build.radix_sort =
+         radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc,
                                     radv_pipeline_cache_to_handle(&device->meta_state.cache));
 
-   const VkPipelineLayoutCreateInfo copy_pl_create_info = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-      .setLayoutCount = 0,
-      .pushConstantRangeCount = 1,
-      .pPushConstantRanges =
-         &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct copy_constants)},
-   };
+      struct radix_sort_vk_sort_devaddr_info *radix_sort_info =
+         &device->meta_state.accel_struct_build.radix_sort_info;
+      radix_sort_info->ext = NULL;
+      radix_sort_info->key_bits = 24;
+      radix_sort_info->fill_buffer = radix_sort_fill_buffer;
+   }
 
-   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &copy_pl_create_info,
-                                      &device->meta_state.alloc,
-                                      &device->meta_state.accel_struct_build.copy_p_layout);
-   if (result != VK_SUCCESS)
-      goto fail;
-
-   VkPipelineShaderStageCreateInfo copy_shader_stage = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
-      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
-      .module = vk_shader_module_handle_from_nir(copy_cs),
-      .pName = "main",
-      .pSpecializationInfo = NULL,
-   };
-
-   VkComputePipelineCreateInfo copy_pipeline_info = {
-      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
-      .stage = copy_shader_stage,
-      .flags = 0,
-      .layout = device->meta_state.accel_struct_build.copy_p_layout,
-   };
-
-   result = radv_CreateComputePipelines(
-      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
-      &copy_pipeline_info, NULL, &device->meta_state.accel_struct_build.copy_pipeline);
-   if (result != VK_SUCCESS)
-      goto fail;
-
-   ralloc_free(copy_cs);
-   ralloc_free(internal_cs);
-   ralloc_free(leaf_cs);
-
-   return VK_SUCCESS;
-
-fail:
-   radv_device_finish_accel_struct_build_state(device);
-   ralloc_free(copy_cs);
-   ralloc_free(internal_cs);
-   ralloc_free(leaf_cs);
    return result;
 }
 
@@ -1725,6 +1967,8 @@ struct bvh_state {
    uint32_t node_offset;
    uint32_t node_count;
    uint32_t scratch_offset;
+   uint32_t buffer_1_offset;
+   uint32_t buffer_2_offset;
 
    uint32_t instance_offset;
    uint32_t instance_count;
@@ -1739,12 +1983,35 @@ radv_CmdBuildAccelerationStructuresKHR(
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
    struct radv_meta_saved_state saved_state;
 
+   enum radv_cmd_flush_bits flush_bits =
+      RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT,
+                            NULL) |
+      radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT,
+                            NULL);
+
+   enum accel_struct_build build_mode = get_accel_struct_build(
+      cmd_buffer->device->physical_device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR);
+   uint32_t node_id_stride = get_node_id_stride(build_mode);
+   uint32_t scratch_offset =
+      (build_mode != accel_struct_build_unoptimized) ? SCRATCH_TOTAL_BOUNDS_SIZE : 0;
+
    radv_meta_save(
       &saved_state, cmd_buffer,
      RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
    struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));
 
-   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
+   if (build_mode != accel_struct_build_unoptimized) {
+      for (uint32_t i = 0; i < infoCount; ++i) {
+         /* Clear the bvh bounds with nan. 
*/ + radv_fill_buffer_shader(cmd_buffer, pInfos[i].scratchData.deviceAddress, 6 * sizeof(float), + 0x7FC00000); + } + + cmd_buffer->state.flush_bits |= flush_bits; + } + + radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { @@ -1753,7 +2020,7 @@ radv_CmdBuildAccelerationStructuresKHR( struct build_primitive_constants prim_consts = { .node_dst_addr = radv_accel_struct_get_va(accel_struct), - .scratch_addr = pInfos[i].scratchData.deviceAddress, + .scratch_addr = pInfos[i].scratchData.deviceAddress + scratch_offset, .dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128, .dst_scratch_offset = 0, }; @@ -1805,20 +2072,91 @@ radv_CmdBuildAccelerationStructuresKHR( unreachable("Unknown geometryType"); } - radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), - cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), - &prim_consts); + radv_CmdPushConstants( + commandBuffer, cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), &prim_consts); radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1); prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount; - prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount; + prim_consts.dst_scratch_offset += + node_id_stride * ppBuildRangeInfos[i][j].primitiveCount; } } bvh_states[i].node_offset = prim_consts.dst_offset; - bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4; + bvh_states[i].node_count = prim_consts.dst_scratch_offset / node_id_stride; } - radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + if (build_mode == accel_struct_build_lbvh) { + cmd_buffer->state.flush_bits |= flush_bits; + + radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + cmd_buffer->device->meta_state.accel_struct_build.morton_pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, + pInfos[i].dstAccelerationStructure); + + const struct morton_constants consts = { + .node_addr = radv_accel_struct_get_va(accel_struct), + .scratch_addr = pInfos[i].scratchData.deviceAddress + SCRATCH_TOTAL_BOUNDS_SIZE, + }; + + radv_CmdPushConstants(commandBuffer, + cmd_buffer->device->meta_state.accel_struct_build.morton_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1); + } + + cmd_buffer->state.flush_bits |= flush_bits; + + for (uint32_t i = 0; i < infoCount; ++i) { + struct radix_sort_vk_memory_requirements requirements; + radix_sort_vk_get_memory_requirements( + cmd_buffer->device->meta_state.accel_struct_build.radix_sort, bvh_states[i].node_count, + &requirements); + + struct radix_sort_vk_sort_devaddr_info info = + cmd_buffer->device->meta_state.accel_struct_build.radix_sort_info; + info.count = bvh_states[i].node_count; + + VkDeviceAddress base_addr = + pInfos[i].scratchData.deviceAddress + SCRATCH_TOTAL_BOUNDS_SIZE; + + info.keyvals_even.buffer = VK_NULL_HANDLE; + info.keyvals_even.offset = 0; + info.keyvals_even.devaddr = base_addr; + + info.keyvals_odd = base_addr + requirements.keyvals_size; + + info.internal.buffer = VK_NULL_HANDLE; + info.internal.offset = 0; + info.internal.devaddr = base_addr + requirements.keyvals_size * 2; 
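+
+      /* Scratch layout while sorting, relative to pInfos[i].scratchData.deviceAddress
+       * (base_addr == deviceAddress + SCRATCH_TOTAL_BOUNDS_SIZE):
+       *   [0, 24)                   bvh bounds, atomically min/maxed by the leaf shader
+       *   [24, 24 + keyvals_size)   keyvals: a 32-bit node id plus its 32-bit morton code
+       *   [.., + keyvals_size)      ping-pong space for the even/odd sorting passes
+       *   [.., + internal_size)     internal state of the radix sort
+       */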
+ + VkDeviceAddress result_addr; + radix_sort_vk_sort_devaddr(cmd_buffer->device->meta_state.accel_struct_build.radix_sort, + &info, radv_device_to_handle(cmd_buffer->device), commandBuffer, + &result_addr); + + assert(result_addr == info.keyvals_even.devaddr || result_addr == info.keyvals_odd); + + if (result_addr == info.keyvals_even.devaddr) { + bvh_states[i].buffer_1_offset = SCRATCH_TOTAL_BOUNDS_SIZE; + bvh_states[i].buffer_2_offset = SCRATCH_TOTAL_BOUNDS_SIZE + requirements.keyvals_size; + } else { + bvh_states[i].buffer_1_offset = SCRATCH_TOTAL_BOUNDS_SIZE + requirements.keyvals_size; + bvh_states[i].buffer_2_offset = SCRATCH_TOTAL_BOUNDS_SIZE; + } + bvh_states[i].scratch_offset = bvh_states[i].buffer_1_offset; + } + + cmd_buffer->state.flush_bits |= flush_bits; + } else { + for (uint32_t i = 0; i < infoCount; ++i) { + bvh_states[i].buffer_1_offset = 0; + bvh_states[i].buffer_2_offset = bvh_states[i].node_count * 4; + } + } + + radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline); bool progress = true; for (unsigned iter = 0; progress; ++iter) { @@ -1830,18 +2168,20 @@ radv_CmdBuildAccelerationStructuresKHR( if (iter && bvh_states[i].node_count == 1) continue; - if (!progress) { - cmd_buffer->state.flush_bits |= - RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_WRITE_BIT, NULL) | - radv_dst_access_flush(cmd_buffer, - VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL); - } + if (!progress) + cmd_buffer->state.flush_bits |= flush_bits; + progress = true; + uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4)); bool final_iter = dst_node_count == 1; + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4; + uint32_t buffer_1_offset = bvh_states[i].buffer_1_offset; + uint32_t buffer_2_offset = bvh_states[i].buffer_2_offset; + uint32_t dst_scratch_offset = + (src_scratch_offset == buffer_1_offset) ? buffer_2_offset : buffer_1_offset; + uint32_t dst_node_offset = bvh_states[i].node_offset; if (final_iter) dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64); @@ -1855,7 +2195,7 @@ radv_CmdBuildAccelerationStructuresKHR( .fill_header = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0), }; - radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + radv_CmdPushConstants(commandBuffer, cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index b9a91d21150..89c66a0af86 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -82,6 +82,8 @@ #include "radv_shader_args.h" #include "sid.h" +#include "radix_sort/radix_sort_vk_devaddr.h" + /* Pre-declarations needed for WSI entrypoints */ struct wl_surface; struct wl_display; @@ -661,10 +663,15 @@ struct radv_meta_state { struct { VkPipelineLayout leaf_p_layout; VkPipeline leaf_pipeline; + VkPipelineLayout morton_p_layout; + VkPipeline morton_pipeline; VkPipelineLayout internal_p_layout; VkPipeline internal_pipeline; VkPipelineLayout copy_p_layout; VkPipeline copy_pipeline; + + struct radix_sort_vk *radix_sort; + struct radix_sort_vk_sort_devaddr_info radix_sort_info; } accel_struct_build; struct {
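
Note on the morton encoding added above: build_morton_component() is the standard
"spread the bits to every third position" expansion from the linked NVIDIA post,
applied to coordinates quantized to 8 bits (the nir_fmul_imm by 255.0), so the three
interleaved components form a 24-bit code — hence radix_sort_info->key_bits = 24.
A CPU-side sketch of the same computation, for illustration only (these helper names
are not part of the patch):

   #include <stdint.h>

   /* Spread the low 8 bits of x so that input bit i lands at output bit 3 * i. */
   static uint32_t
   morton_expand_8(uint32_t x)
   {
      x = (x * 0x00000101u) & 0x0F00F00Fu; /* x |= x << 8; isolate the two nibbles */
      x = (x * 0x00000011u) & 0xC30C30C3u; /* x |= x << 4; isolate 2-bit pairs */
      x = (x * 0x00000005u) & 0x49249249u; /* x |= x << 2; isolate every third bit */
      return x;
   }

   /* 24-bit morton code matching the shader: x in bits 2,5,...,23, y in bits
    * 1,4,...,22 and z in bits 0,3,...,21.
    */
   static uint32_t
   morton_code_8(uint8_t x, uint8_t y, uint8_t z)
   {
      return (morton_expand_8(x) << 2) | (morton_expand_8(y) << 1) | morton_expand_8(z);
   }

Sorting the leaf node ids by this code groups leaves that are close together on the
Z-order curve, so the bottom-up internal-node passes that follow produce spatially
coherent boxes instead of the arbitrary grouping of the unoptimized path.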