diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h index 30b3224b2d6..3014a827e61 100644 --- a/src/amd/vulkan/bvh/build_helpers.h +++ b/src/amd/vulkan/bvh/build_helpers.h @@ -8,210 +8,7 @@ #define BVH_BUILD_HELPERS_H #include "bvh.h" - -#define VK_FORMAT_UNDEFINED 0 -#define VK_FORMAT_R4G4_UNORM_PACK8 1 -#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 -#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 -#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 -#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 -#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 -#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 -#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 -#define VK_FORMAT_R8_UNORM 9 -#define VK_FORMAT_R8_SNORM 10 -#define VK_FORMAT_R8_USCALED 11 -#define VK_FORMAT_R8_SSCALED 12 -#define VK_FORMAT_R8_UINT 13 -#define VK_FORMAT_R8_SINT 14 -#define VK_FORMAT_R8_SRGB 15 -#define VK_FORMAT_R8G8_UNORM 16 -#define VK_FORMAT_R8G8_SNORM 17 -#define VK_FORMAT_R8G8_USCALED 18 -#define VK_FORMAT_R8G8_SSCALED 19 -#define VK_FORMAT_R8G8_UINT 20 -#define VK_FORMAT_R8G8_SINT 21 -#define VK_FORMAT_R8G8_SRGB 22 -#define VK_FORMAT_R8G8B8_UNORM 23 -#define VK_FORMAT_R8G8B8_SNORM 24 -#define VK_FORMAT_R8G8B8_USCALED 25 -#define VK_FORMAT_R8G8B8_SSCALED 26 -#define VK_FORMAT_R8G8B8_UINT 27 -#define VK_FORMAT_R8G8B8_SINT 28 -#define VK_FORMAT_R8G8B8_SRGB 29 -#define VK_FORMAT_B8G8R8_UNORM 30 -#define VK_FORMAT_B8G8R8_SNORM 31 -#define VK_FORMAT_B8G8R8_USCALED 32 -#define VK_FORMAT_B8G8R8_SSCALED 33 -#define VK_FORMAT_B8G8R8_UINT 34 -#define VK_FORMAT_B8G8R8_SINT 35 -#define VK_FORMAT_B8G8R8_SRGB 36 -#define VK_FORMAT_R8G8B8A8_UNORM 37 -#define VK_FORMAT_R8G8B8A8_SNORM 38 -#define VK_FORMAT_R8G8B8A8_USCALED 39 -#define VK_FORMAT_R8G8B8A8_SSCALED 40 -#define VK_FORMAT_R8G8B8A8_UINT 41 -#define VK_FORMAT_R8G8B8A8_SINT 42 -#define VK_FORMAT_R8G8B8A8_SRGB 43 -#define VK_FORMAT_B8G8R8A8_UNORM 44 -#define VK_FORMAT_B8G8R8A8_SNORM 45 -#define VK_FORMAT_B8G8R8A8_USCALED 46 -#define VK_FORMAT_B8G8R8A8_SSCALED 47 -#define VK_FORMAT_B8G8R8A8_UINT 48 -#define VK_FORMAT_B8G8R8A8_SINT 49 -#define VK_FORMAT_B8G8R8A8_SRGB 50 -#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 -#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 -#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 -#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 -#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 -#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 -#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 -#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 -#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 -#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 -#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 -#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 -#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 -#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 -#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 -#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 -#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 -#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 -#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 -#define VK_FORMAT_R16_UNORM 70 -#define VK_FORMAT_R16_SNORM 71 -#define VK_FORMAT_R16_USCALED 72 -#define VK_FORMAT_R16_SSCALED 73 -#define VK_FORMAT_R16_UINT 74 -#define VK_FORMAT_R16_SINT 75 -#define VK_FORMAT_R16_SFLOAT 76 -#define VK_FORMAT_R16G16_UNORM 77 -#define VK_FORMAT_R16G16_SNORM 78 -#define VK_FORMAT_R16G16_USCALED 79 -#define VK_FORMAT_R16G16_SSCALED 80 -#define VK_FORMAT_R16G16_UINT 81 -#define VK_FORMAT_R16G16_SINT 82 -#define VK_FORMAT_R16G16_SFLOAT 83 -#define VK_FORMAT_R16G16B16_UNORM 84 -#define VK_FORMAT_R16G16B16_SNORM 85 -#define 
VK_FORMAT_R16G16B16_USCALED 86 -#define VK_FORMAT_R16G16B16_SSCALED 87 -#define VK_FORMAT_R16G16B16_UINT 88 -#define VK_FORMAT_R16G16B16_SINT 89 -#define VK_FORMAT_R16G16B16_SFLOAT 90 -#define VK_FORMAT_R16G16B16A16_UNORM 91 -#define VK_FORMAT_R16G16B16A16_SNORM 92 -#define VK_FORMAT_R16G16B16A16_USCALED 93 -#define VK_FORMAT_R16G16B16A16_SSCALED 94 -#define VK_FORMAT_R16G16B16A16_UINT 95 -#define VK_FORMAT_R16G16B16A16_SINT 96 -#define VK_FORMAT_R16G16B16A16_SFLOAT 97 -#define VK_FORMAT_R32_UINT 98 -#define VK_FORMAT_R32_SINT 99 -#define VK_FORMAT_R32_SFLOAT 100 -#define VK_FORMAT_R32G32_UINT 101 -#define VK_FORMAT_R32G32_SINT 102 -#define VK_FORMAT_R32G32_SFLOAT 103 -#define VK_FORMAT_R32G32B32_UINT 104 -#define VK_FORMAT_R32G32B32_SINT 105 -#define VK_FORMAT_R32G32B32_SFLOAT 106 -#define VK_FORMAT_R32G32B32A32_UINT 107 -#define VK_FORMAT_R32G32B32A32_SINT 108 -#define VK_FORMAT_R32G32B32A32_SFLOAT 109 -#define VK_FORMAT_R64_UINT 110 -#define VK_FORMAT_R64_SINT 111 -#define VK_FORMAT_R64_SFLOAT 112 -#define VK_FORMAT_R64G64_UINT 113 -#define VK_FORMAT_R64G64_SINT 114 -#define VK_FORMAT_R64G64_SFLOAT 115 -#define VK_FORMAT_R64G64B64_UINT 116 -#define VK_FORMAT_R64G64B64_SINT 117 -#define VK_FORMAT_R64G64B64_SFLOAT 118 -#define VK_FORMAT_R64G64B64A64_UINT 119 -#define VK_FORMAT_R64G64B64A64_SINT 120 -#define VK_FORMAT_R64G64B64A64_SFLOAT 121 - -#define VK_INDEX_TYPE_UINT16 0 -#define VK_INDEX_TYPE_UINT32 1 -#define VK_INDEX_TYPE_NONE_KHR 1000165000 -#define VK_INDEX_TYPE_UINT8_EXT 1000265000 - -#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 -#define VK_GEOMETRY_TYPE_AABBS_KHR 1 -#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 - -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 -#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 -#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 -#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 - -#define TYPE(type, align) \ - layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ - { \ - type value; \ - }; - -#define REF(type) type##_ref -#define VOID_REF uint64_t -#define NULL 0 -#define DEREF(var) var.value - -#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) - -#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) - -#define INFINITY (1.0 / 0.0) -#define NAN (0.0 / 0.0) - -#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) - -TYPE(int8_t, 1); -TYPE(uint8_t, 1); -TYPE(int16_t, 2); -TYPE(uint16_t, 2); -TYPE(int32_t, 4); -TYPE(uint32_t, 4); -TYPE(int64_t, 8); -TYPE(uint64_t, 8); - -TYPE(float, 4); - -TYPE(vec2, 4); -TYPE(vec3, 4); -TYPE(vec4, 4); - -TYPE(uvec4, 16); - -TYPE(VOID_REF, 8); - -/* copied from u_math.h */ -uint32_t -align(uint32_t value, uint32_t alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - -int32_t -to_emulated_float(float f) -{ - int32_t bits = floatBitsToInt(f); - return f < 0 ? -2147483648 - bits : bits; -} - -float -from_emulated_float(int32_t bits) -{ - return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); -} - -TYPE(radv_aabb, 4); - -struct key_id_pair { - uint32_t id; - uint32_t key; -}; -TYPE(key_id_pair, 4); +#include "vk_build_helpers.h" TYPE(radv_accel_struct_serialization_header, 8); TYPE(radv_accel_struct_header, 8); @@ -221,12 +18,6 @@ TYPE(radv_bvh_instance_node, 8); TYPE(radv_bvh_box16_node, 4); TYPE(radv_bvh_box32_node, 4); -TYPE(radv_ir_header, 4); -TYPE(radv_ir_node, 4); -TYPE(radv_ir_box_node, 4); - -TYPE(radv_global_sync_data, 4); - uint32_t id_to_offset(uint32_t id) { @@ -259,178 +50,23 @@ addr_to_node(uint64_t addr) return (addr >> 3) & ((1ul << 45) - 1); } -uint32_t -ir_id_to_offset(uint32_t id) -{ - return id & (~3u); -} - -uint32_t -ir_id_to_type(uint32_t id) -{ - return id & 3u; -} - -uint32_t -pack_ir_node_id(uint32_t offset, uint32_t type) -{ - return offset | type; -} - uint32_t ir_type_to_bvh_type(uint32_t type) { switch (type) { - case radv_ir_node_triangle: + case vk_ir_node_triangle: return radv_bvh_node_triangle; - case radv_ir_node_internal: + case vk_ir_node_internal: return radv_bvh_node_box32; - case radv_ir_node_instance: + case vk_ir_node_instance: return radv_bvh_node_instance; - case radv_ir_node_aabb: + case vk_ir_node_aabb: return radv_bvh_node_aabb; } /* unreachable in valid nodes */ return RADV_BVH_INVALID_NODE; } -float -aabb_surface_area(radv_aabb aabb) -{ - vec3 diagonal = aabb.max - aabb.min; - return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; -} - -/* Just a wrapper for 3 uints. */ -struct triangle_indices { - uint32_t index[3]; -}; - -triangle_indices -load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) -{ - triangle_indices result; - - uint32_t index_base = global_id * 3; - - switch (index_format) { - case VK_INDEX_TYPE_UINT16: { - result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_UINT32: { - result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); - break; - } - case VK_INDEX_TYPE_NONE_KHR: { - result.index[0] = index_base + 0; - result.index[1] = index_base + 1; - result.index[2] = index_base + 2; - break; - } - case VK_INDEX_TYPE_UINT8_EXT: { - result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); - result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); - result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); - break; - } - } - - return result; -} - -/* Just a wrapper for 3 vec4s. 
*/ -struct triangle_vertices { - vec4 vertex[3]; -}; - -TYPE(float16_t, 2); - -triangle_vertices -load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) -{ - triangle_vertices result; - - for (uint32_t i = 0; i < 3; i++) { - VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); - vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); - - switch (vertex_format) { - case VK_FORMAT_R32G32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - break; - case VK_FORMAT_R32G32B32_SFLOAT: - case VK_FORMAT_R32G32B32A32_SFLOAT: - vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - break; - case VK_FORMAT_R16G16B16_SFLOAT: - case VK_FORMAT_R16G16B16A16_SFLOAT: - vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); - vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); - vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); - break; - case VK_FORMAT_R16G16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - break; - case VK_FORMAT_R16G16B16A16_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); - vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); - vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); - break; - case VK_FORMAT_R8G8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - break; - case VK_FORMAT_R8G8B8A8_SNORM: - vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); - vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); - vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); - break; - case VK_FORMAT_R16G16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - break; - case VK_FORMAT_R16G16B16A16_UNORM: - vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); - vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); - vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); - break; - case VK_FORMAT_R8G8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - break; - case VK_FORMAT_R8G8B8A8_UNORM: - vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); - vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); - vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); - break; - case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { - uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); - vertex.x = float(data & 0x3FF) / 0x3FF; - vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; - vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; - break; - } - } - - result.vertex[i] = vertex; - } - - return result; -} - /* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. 
*/ struct AccelerationStructureInstance { mat3x4 transform; @@ -441,7 +77,7 @@ struct AccelerationStructureInstance { TYPE(AccelerationStructureInstance, 8); bool -build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data geom_data, uint32_t global_id) +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) { bool is_valid = true; triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); @@ -490,7 +126,7 @@ build_triangle(inout radv_aabb bounds, VOID_REF dst_ptr, radv_bvh_geometry_data } bool -build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) { bool is_valid = true; REF(radv_bvh_aabb_node) node = REF(radv_bvh_aabb_node)(dst_ptr); @@ -521,10 +157,10 @@ build_aabb(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t return is_valid; } -radv_aabb +vk_aabb calculate_instance_node_bounds(radv_accel_struct_header header, mat3x4 otw_matrix) { - radv_aabb aabb; + vk_aabb aabb; for (uint32_t comp = 0; comp < 3; ++comp) { aabb.min[comp] = otw_matrix[comp][3]; aabb.max[comp] = otw_matrix[comp][3]; @@ -555,7 +191,7 @@ encode_sbt_offset_and_flags(uint32_t src) } bool -build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) { REF(radv_bvh_instance_node) node = REF(radv_bvh_instance_node)(dst_ptr); @@ -591,123 +227,4 @@ build_instance(inout radv_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint3 From macros.h */ #define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) -#ifdef USE_GLOBAL_SYNC - -/* There might be more invocations available than tasks to do. - * In that case, the fetched task index is greater than the - * counter offset for the next phase. To avoid out-of-bounds - * accessing, phases will be skipped until the task index is - * is in-bounds again. */ -uint32_t num_tasks_to_skip = 0; -uint32_t phase_index = 0; -bool should_skip = false; -shared uint32_t global_task_index; - -shared uint32_t shared_phase_index; - -uint32_t -task_count(REF(radv_ir_header) header) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - return DEREF(header).sync_data.task_counts[phase_index & 1]; -} - -/* Sets the task count for the next phase. */ -void -set_next_task_count(REF(radv_ir_header) header, uint32_t new_count) -{ - uint32_t phase_index = DEREF(header).sync_data.phase_index; - DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count; -} - -/* - * This function has two main objectives: - * Firstly, it partitions pending work among free invocations. - * Secondly, it guarantees global synchronization between different phases. - * - * After every call to fetch_task, a new task index is returned. - * fetch_task will also set num_tasks_to_skip. Use should_execute_phase - * to determine if the current phase should be executed or skipped. - * - * Since tasks are assigned per-workgroup, there is a possibility of the task index being - * greater than the total task count. - */ -uint32_t -fetch_task(REF(radv_ir_header) header, bool did_work) -{ - /* Perform a memory + control barrier for all buffer writes for the entire workgroup. 
- * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished - * and their results are written to memory. */ - controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - if (gl_LocalInvocationIndex == 0) { - if (did_work) - atomicAdd(DEREF(header).sync_data.task_done_counter, 1); - global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1); - - do { - /* Perform a memory barrier to refresh the current phase's end counter, in case - * another workgroup changed it. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - - /* The first invocation of the first workgroup in a new phase is responsible to initiate the - * switch to a new phase. It is only possible to switch to a new phase if all tasks of the - * previous phase have been completed. Switching to a new phase and incrementing the phase - * end counter in turn notifies all invocations for that phase that it is safe to execute. - */ - if (global_task_index == DEREF(header).sync_data.current_phase_end_counter && - DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) { - if (DEREF(header).sync_data.next_phase_exit_flag != 0) { - DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID; - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - } else { - atomicAdd(DEREF(header).sync_data.phase_index, 1); - DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter; - /* Ensure the changes to the phase index and start/end counter are visible for other - * workgroup waiting in the loop. */ - memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); - atomicAdd(DEREF(header).sync_data.current_phase_end_counter, - DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); - } - break; - } - - /* If other invocations have finished all nodes, break out; there is no work to do */ - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { - break; - } - } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); - - shared_phase_index = DEREF(header).sync_data.phase_index; - } - - barrier(); - if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) - return TASK_INDEX_INVALID; - - num_tasks_to_skip = shared_phase_index - phase_index; - - uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; - return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; -} - -bool -should_execute_phase() -{ - if (num_tasks_to_skip > 0) { - /* Skip to next phase. 
*/ - ++phase_index; - --num_tasks_to_skip; - return false; - } - return true; -} - -#define PHASE(header) \ - for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) -#endif - #endif /* BUILD_HELPERS_H */ diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index 6422319c506..c0c06c98fed 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -16,49 +16,6 @@ #define VOID_REF uint64_t #endif -struct leaf_args { - VOID_REF ir; - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; - - radv_bvh_geometry_data geom_data; -}; - -struct morton_args { - VOID_REF bvh; - REF(radv_ir_header) header; - REF(key_id_pair) ids; -}; - -#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 -#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) - -struct lbvh_node_info { - /* Number of children that have been processed (or are invalid/leaves) in - * the lbvh_generate_ir pass. - */ - uint32_t path_count; - - uint32_t children[2]; - uint32_t parent; -}; - -struct lbvh_main_args { - VOID_REF bvh; - REF(key_id_pair) src_ids; - VOID_REF node_info; - uint32_t id_count; - uint32_t internal_node_base; -}; - -struct lbvh_generate_ir_args { - VOID_REF bvh; - VOID_REF node_info; - VOID_REF header; - uint32_t internal_node_base; -}; - #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 #define RADV_COPY_MODE_DESERIALIZE 2 @@ -72,30 +29,14 @@ struct copy_args { struct encode_args { VOID_REF intermediate_bvh; VOID_REF output_bvh; - REF(radv_ir_header) header; + REF(vk_ir_header) header; uint32_t output_bvh_offset; uint32_t leaf_node_count; uint32_t geometry_type; }; -struct ploc_prefix_scan_partition { - uint32_t aggregate; - uint32_t inclusive_sum; -}; - -#define PLOC_WORKGROUP_SIZE 1024 - -struct ploc_args { - VOID_REF bvh; - VOID_REF prefix_scan_partitions; - REF(radv_ir_header) header; - VOID_REF ids_0; - VOID_REF ids_1; - uint32_t internal_node_offset; -}; - struct header_args { - REF(radv_ir_header) src; + REF(vk_ir_header) src; REF(radv_accel_struct_header) dst; uint32_t bvh_offset; uint32_t instance_count; @@ -104,11 +45,11 @@ struct header_args { struct update_args { REF(radv_accel_struct_header) src; REF(radv_accel_struct_header) dst; - REF(radv_aabb) leaf_bounds; + REF(vk_aabb) leaf_bounds; REF(uint32_t) internal_ready_count; uint32_t leaf_node_count; - radv_bvh_geometry_data geom_data; + vk_bvh_geometry_data geom_data; }; #endif /* BUILD_INTERFACE_H */ diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h index 27399fff200..2b87ec47664 100644 --- a/src/amd/vulkan/bvh/bvh.h +++ b/src/amd/vulkan/bvh/bvh.h @@ -7,17 +7,14 @@ #ifndef BVH_BVH_H #define BVH_BVH_H +#include "vk_bvh.h" + #define radv_bvh_node_triangle 0 #define radv_bvh_node_box16 4 #define radv_bvh_node_box32 5 #define radv_bvh_node_instance 6 #define radv_bvh_node_aabb 7 -#define radv_ir_node_triangle 0 -#define radv_ir_node_internal 1 -#define radv_ir_node_instance 2 -#define radv_ir_node_aabb 3 - #define RADV_GEOMETRY_OPAQUE (1u << 31) #define RADV_INSTANCE_FORCE_OPAQUE (1u << 31) @@ -29,31 +26,9 @@ #define VK_UUID_SIZE 16 #else #include -typedef struct radv_ir_node radv_ir_node; -typedef struct radv_global_sync_data radv_global_sync_data; -typedef struct radv_bvh_geometry_data radv_bvh_geometry_data; - typedef uint16_t float16_t; - -typedef struct { - float values[3][4]; -} mat3x4; - -typedef struct { - float x; - float y; - float z; -} vec3; - -typedef struct radv_aabb radv_aabb; - #endif 
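/* The to_emulated_float()/from_emulated_float() helpers removed above (in favour
 * of the shared vk_build_helpers.h include) map IEEE-754 floats to int32 values
 * whose signed ordering matches the float ordering, so scene bounds can be
 * reduced with plain integer atomicMin()/atomicMax(). A minimal host-side C
 * sketch of the same mapping, for illustration only (not part of this patch): */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t
to_emulated_float(float f)
{
   int32_t bits;
   memcpy(&bits, &f, sizeof(bits));
   /* Negative floats are ordered in reverse when their bit patterns are read as
    * two's-complement integers; remapping them to INT32_MIN - bits restores a
    * monotonically increasing order. */
   return f < 0.0f ? INT32_MIN - bits : bits;
}

static float
from_emulated_float(int32_t emulated)
{
   int32_t bits = emulated < 0 ? INT32_MIN - emulated : emulated;
   float f;
   memcpy(&f, &bits, sizeof(f));
   return f;
}

int
main(void)
{
   assert(to_emulated_float(-2.0f) < to_emulated_float(-1.0f));
   assert(to_emulated_float(-1.0f) < to_emulated_float(0.0f));
   assert(to_emulated_float(0.0f) < to_emulated_float(3.5f));
   assert(from_emulated_float(to_emulated_float(-1.5f)) == -1.5f);
   return 0;
}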
-struct radv_aabb { - vec3 min; - vec3 max; -}; - struct radv_accel_struct_serialization_header { uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t accel_struct_compat[VK_UUID_SIZE]; @@ -74,7 +49,7 @@ struct radv_accel_struct_geometry_info { struct radv_accel_struct_header { uint32_t bvh_offset; uint32_t reserved; - radv_aabb aabb; + vk_aabb aabb; /* Everything after this gets either updated/copied from the CPU or written by header.comp. */ uint64_t compacted_size; @@ -89,45 +64,6 @@ struct radv_accel_struct_header { uint32_t build_flags; }; -struct radv_ir_node { - radv_aabb aabb; -}; - -#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF -#define RADV_NULL_BVH_OFFSET 0xFFFFFFFE - -struct radv_ir_box_node { - radv_ir_node base; - uint32_t children[2]; - uint32_t bvh_offset; -}; - -struct radv_global_sync_data { - uint32_t task_counts[2]; - uint32_t task_started_counter; - uint32_t task_done_counter; - uint32_t current_phase_start_counter; - uint32_t current_phase_end_counter; - uint32_t phase_index; - /* If this flag is set, the shader should exit - * instead of executing another phase */ - uint32_t next_phase_exit_flag; -}; - -struct radv_ir_header { - int32_t min_bounds[3]; - int32_t max_bounds[3]; - uint32_t active_leaf_count; - /* Indirect dispatch dimensions for the encoder. - * ir_internal_node_count is the thread count in the X dimension, - * while Y and Z are always set to 1. */ - uint32_t ir_internal_node_count; - uint32_t dispatch_size_y; - uint32_t dispatch_size_z; - radv_global_sync_data sync_data; - uint32_t dst_node_offset; -}; - struct radv_bvh_triangle_node { float coords[3][3]; uint32_t reserved[3]; @@ -170,28 +106,11 @@ struct radv_bvh_box16_node { struct radv_bvh_box32_node { uint32_t children[4]; - radv_aabb coords[4]; + vk_aabb coords[4]; uint32_t reserved[4]; }; #define RADV_BVH_ROOT_NODE radv_bvh_node_box32 #define RADV_BVH_INVALID_NODE 0xffffffffu -/* If the task index is set to this value, there is no - * more work to do. */ -#define TASK_INDEX_INVALID 0xFFFFFFFF - -struct radv_bvh_geometry_data { - uint64_t data; - uint64_t indices; - uint64_t transform; - - uint32_t geometry_id; - uint32_t geometry_type; - uint32_t first_id; - uint32_t stride; - uint32_t vertex_format; - uint32_t index_format; -}; - #endif /* BVH_H */ diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp index 5c84f631860..50623aa3736 100644 --- a/src/amd/vulkan/bvh/encode.comp +++ b/src/amd/vulkan/bvh/encode.comp @@ -36,31 +36,85 @@ void set_parent(uint32_t child, uint32_t parent) void main() { - /* Revert the order so we start at the root */ - uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; - - uint32_t output_leaf_node_size; - switch (args.geometry_type) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); - break; - default: /* instances */ - output_leaf_node_size = SIZEOF(radv_bvh_instance_node); - break; - } - - uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * SIZEOF(radv_ir_node); + /* Encode leaf nodes. 
*/ uint32_t dst_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_bvh_box32_node); + + uint32_t ir_leaf_node_size; + uint32_t output_leaf_node_size; + switch (args.geometry_type) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_triangle_node); + output_leaf_node_size = SIZEOF(radv_bvh_triangle_node); + + vk_ir_triangle_node src_node = + DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_triangle_node) dst_node = + REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).coords = src_node.coords; + DEREF(dst_node).triangle_id = src_node.triangle_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + DEREF(dst_node).id = 9; + + break; + } + case VK_GEOMETRY_TYPE_AABBS_KHR: { + ir_leaf_node_size = SIZEOF(vk_ir_aabb_node); + output_leaf_node_size = SIZEOF(radv_bvh_aabb_node); + + vk_ir_aabb_node src_node = + DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_aabb_node) dst_node = + REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + DEREF(dst_node).primitive_id = src_node.primitive_id; + DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags; + + break; + } + default: { + /* instances */ + ir_leaf_node_size = SIZEOF(vk_ir_instance_node); + output_leaf_node_size = SIZEOF(radv_bvh_instance_node); + + vk_ir_instance_node src_node = + DEREF(REF(vk_ir_instance_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size))); + REF(radv_bvh_instance_node) dst_node = + REF(radv_bvh_instance_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size)); + + radv_accel_struct_header blas_header = + DEREF(REF(radv_accel_struct_header)(src_node.base_ptr)); + + DEREF(dst_node).bvh_ptr = addr_to_node(src_node.base_ptr + blas_header.bvh_offset); + DEREF(dst_node).bvh_offset = blas_header.bvh_offset; + + mat4 transform = mat4(src_node.otw_matrix); + mat4 inv_transform = transpose(inverse(transpose(transform))); + DEREF(dst_node).wto_matrix = mat3x4(inv_transform); + DEREF(dst_node).otw_matrix = mat3x4(transform); + + DEREF(dst_node).custom_instance_and_mask = src_node.custom_instance_and_mask; + DEREF(dst_node).sbt_offset_and_flags = encode_sbt_offset_and_flags(src_node.sbt_offset_and_flags); + DEREF(dst_node).instance_id = src_node.instance_id; + + break; + } + } + + if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count) + return; + + /* Encode internal nodes. 
Revert the order so we start at the root */ + uint32_t global_id = DEREF(args.header).ir_internal_node_count - 1 - gl_GlobalInvocationID.x; + + uint32_t intermediate_leaf_nodes_size = args.leaf_node_count * ir_leaf_node_size; uint32_t dst_internal_offset = dst_leaf_offset + args.leaf_node_count * output_leaf_node_size; - REF(radv_ir_box_node) intermediate_internal_nodes = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); - REF(radv_ir_box_node) src_node = INDEX(radv_ir_box_node, intermediate_internal_nodes, global_id); - radv_ir_box_node src = DEREF(src_node); + REF(vk_ir_box_node) intermediate_internal_nodes = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, intermediate_leaf_nodes_size); + REF(vk_ir_box_node) src_node = INDEX(vk_ir_box_node, intermediate_internal_nodes, global_id); + vk_ir_box_node src = DEREF(src_node); bool is_root_node = global_id == DEREF(args.header).ir_internal_node_count - 1; @@ -70,10 +124,10 @@ main() gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset; - if (bvh_offset == RADV_UNKNOWN_BVH_OFFSET) + if (bvh_offset == VK_UNKNOWN_BVH_OFFSET) continue; - if (bvh_offset == RADV_NULL_BVH_OFFSET) + if (bvh_offset == VK_NULL_BVH_OFFSET) break; REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset)); @@ -92,11 +146,11 @@ main() float largest_surface_area = -INFINITY; for (int32_t i = 0; i < found_child_count; ++i) { - if (ir_id_to_type(children[i]) != radv_ir_node_internal) + if (ir_id_to_type(children[i]) != vk_ir_node_internal) continue; - radv_aabb bounds = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, + vk_aabb bounds = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[i]))).aabb; float surface_area = aabb_surface_area(bounds); @@ -107,8 +161,8 @@ main() } if (collapsed_child_index != -1) { - REF(radv_ir_box_node) child_node = - REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, + REF(vk_ir_box_node) child_node = + REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, ir_id_to_offset(children[collapsed_child_index])); uint32_t grandchildren[2] = DEREF(child_node).children; uint32_t valid_grandchild_count = 0; @@ -131,7 +185,7 @@ main() children[collapsed_child_index] = children[found_child_count]; } - DEREF(child_node).bvh_offset = RADV_NULL_BVH_OFFSET; + DEREF(child_node).bvh_offset = VK_NULL_BVH_OFFSET; } else break; } @@ -141,24 +195,24 @@ main() uint32_t offset = ir_id_to_offset(children[i]); uint32_t dst_offset; - if (type == radv_ir_node_internal) { + if (type == vk_ir_node_internal) { #if COMPACT dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node)); #else uint32_t offset_in_internal_nodes = offset - intermediate_leaf_nodes_size; - uint32_t child_index = offset_in_internal_nodes / SIZEOF(radv_ir_box_node); + uint32_t child_index = offset_in_internal_nodes / SIZEOF(vk_ir_box_node); dst_offset = dst_internal_offset + child_index * SIZEOF(radv_bvh_box32_node); #endif - REF(radv_ir_box_node) child_node = REF(radv_ir_box_node)OFFSET(args.intermediate_bvh, offset); + REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset); DEREF(child_node).bvh_offset = dst_offset; } else { - uint32_t child_index = offset / SIZEOF(radv_ir_node); + uint32_t child_index = offset / ir_leaf_node_size; dst_offset = dst_leaf_offset + child_index * output_leaf_node_size; } - radv_aabb 
child_aabb = - DEREF(REF(radv_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; + vk_aabb child_aabb = + DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb; DEREF(dst_node).coords[i] = child_aabb; diff --git a/src/amd/vulkan/bvh/leaf.comp b/src/amd/vulkan/bvh/leaf.comp deleted file mode 100644 index 26568527c6f..00000000000 --- a/src/amd/vulkan/bvh/leaf.comp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require -#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require - -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -#include "build_interface.h" - -layout(push_constant) uniform CONSTS { - leaf_args args; -}; - -void -main(void) -{ - uint32_t global_id = gl_GlobalInvocationID.x; - uint32_t primitive_id = args.geom_data.first_id + global_id; - - REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); - uint32_t src_offset = global_id * args.geom_data.stride; - - uint32_t dst_stride; - uint32_t node_type; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - dst_stride = SIZEOF(radv_bvh_triangle_node); - node_type = radv_ir_node_triangle; - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - dst_stride = SIZEOF(radv_bvh_aabb_node); - node_type = radv_ir_node_aabb; - } else { - dst_stride = SIZEOF(radv_bvh_instance_node); - node_type = radv_ir_node_instance; - } - - uint32_t dst_offset = primitive_id * dst_stride; - VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); - - radv_aabb bounds; - bool is_active; - if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { - is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); - } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); - } else { - VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); - /* arrayOfPointers */ - if (args.geom_data.stride == 8) { - src_ptr = DEREF(REF(VOID_REF)(src_ptr)); - } - - is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); - } - -#if ALWAYS_ACTIVE - if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { - bounds.min = vec3(0.0); - bounds.max = vec3(0.0); - is_active = true; - } -#endif - - if (is_active) { - REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id); - DEREF(ir_node).aabb = bounds; - } - - uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node); - DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(ir_offset, node_type) : RADV_BVH_INVALID_NODE; - - uvec4 ballot = subgroupBallot(is_active); - if (subgroupElect()) - atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); - - atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); - atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); - atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); - atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); - atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); - atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); -} diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 594194169a9..9173892d4a1 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -23,36 +23,6 @@ bvh_shaders = [ 'header', [], ], - [ - 'lbvh_generate_ir.comp', - 'lbvh_generate_ir', - [], - ], - [ - 'lbvh_main.comp', - 'lbvh_main', - [], - ], - [ - 'leaf.comp', - 'leaf', - ['ALWAYS_ACTIVE=0'], - ], - [ - 'leaf.comp', - 'leaf_always_active', - ['ALWAYS_ACTIVE=1'], - ], - [ - 'morton.comp', - 'morton', - [], - ], - [ - 'ploc_internal.comp', - 'ploc_internal', - [], - ], [ 'update.comp', 'update', @@ -61,17 +31,20 @@ bvh_shaders = [ ] bvh_include_dir = dir_source_root + '/src/amd/vulkan/bvh' +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' bvh_includes = files( 'build_helpers.h', 'build_interface.h', 'bvh.h', + vk_bvh_include_dir + '/vk_build_helpers.h', + vk_bvh_include_dir + '/vk_bvh.h', ) bvh_spv = [] foreach s : bvh_shaders command = [ - prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'spirv1.5', + prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet, ] diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp index 54577355e9e..ca06dfdf375 100644 --- a/src/amd/vulkan/bvh/update.comp +++ b/src/amd/vulkan/bvh/update.comp @@ -53,7 +53,7 @@ void main() { VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset); uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride; - radv_aabb bounds; + vk_aabb bounds; bool is_active; if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x); @@ -65,7 +65,7 @@ void main() { if (!is_active) return; - DEREF(INDEX(radv_aabb, args.leaf_bounds, leaf_node_id)) = bounds; + DEREF(INDEX(vk_aabb, args.leaf_bounds, leaf_node_id)) = bounds; memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); @@ -112,11 +112,11 @@ void main() { for (uint32_t i = 0; i < valid_child_count; ++i) { uint32_t child_offset = id_to_offset(children[i]); - radv_aabb child_bounds; + vk_aabb child_bounds; if (child_offset == dst_offset) child_bounds = bounds; else if (child_offset >= internal_nodes_offset) { - child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + child_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset); for (uint32_t j = 0; j < 4; ++j) { if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE) @@ -126,16 +126,16 @@ void main() { } } else { uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size; - child_bounds = 
DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index)); + child_bounds = DEREF(INDEX(vk_aabb, args.leaf_bounds, child_index)); } DEREF(dst_node).coords[i] = child_bounds; } if (parent_id == RADV_BVH_ROOT_NODE) { - radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY)); + vk_aabb root_bounds = vk_aabb(vec3(INFINITY), vec3(-INFINITY)); for (uint32_t i = 0; i < valid_child_count; ++i) { - radv_aabb bounds = DEREF(dst_node).coords[i]; + vk_aabb bounds = DEREF(dst_node).coords[i]; root_bounds.min = min(root_bounds.min, bounds.min); root_bounds.max = max(root_bounds.max, bounds.max); } diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 5976bef8b85..539be57c9dc 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -191,9 +191,6 @@ if amd_with_llvm ) endif -subdir('radix_sort') -libradv_files += radix_sort_files - subdir('bvh') subdir('layers') diff --git a/src/amd/vulkan/radix_sort/meson.build b/src/amd/vulkan/radix_sort/meson.build deleted file mode 100644 index c1478755822..00000000000 --- a/src/amd/vulkan/radix_sort/meson.build +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -subdir('shaders') - -radix_sort_files = files( - 'common/vk/barrier.c', - 'common/vk/barrier.h', - 'common/macros.h', - 'common/util.c', - 'common/util.h', - 'shaders/push.h', - 'targets/u64/config.h', - 'radix_sort_vk_devaddr.h', - 'radix_sort_vk_ext.h', - 'radix_sort_vk.c', - 'radix_sort_vk.h', - 'radv_radix_sort.c', - 'radv_radix_sort.h', - 'target.h' -) diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.c b/src/amd/vulkan/radix_sort/radv_radix_sort.c deleted file mode 100644 index 4305baaba75..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#include "radv_radix_sort.h" -#include "targets/u64/config.h" -#include "radv_cmd_buffer.h" -#include "target.h" - -static const uint32_t init_spv[] = { -#include "radix_sort/shaders/init.comp.spv.h" -}; - -static const uint32_t fill_spv[] = { -#include "radix_sort/shaders/fill.comp.spv.h" -}; - -static const uint32_t histogram_spv[] = { -#include "radix_sort/shaders/histogram.comp.spv.h" -}; - -static const uint32_t prefix_spv[] = { -#include "radix_sort/shaders/prefix.comp.spv.h" -}; - -static const uint32_t scatter_0_even_spv[] = { -#include "radix_sort/shaders/scatter_0_even.comp.spv.h" -}; - -static const uint32_t scatter_0_odd_spv[] = { -#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" -}; - -static const uint32_t scatter_1_even_spv[] = { -#include "radix_sort/shaders/scatter_1_even.comp.spv.h" -}; - -static const uint32_t scatter_1_odd_spv[] = { -#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" -}; - -static const struct radix_sort_vk_target_config target_config = { - .keyval_dwords = RS_KEYVAL_DWORDS, - - .histogram = - { - .workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, - .block_rows = RS_HISTOGRAM_BLOCK_ROWS, - }, - - .prefix = - { - .workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2, - }, - - .scatter = - { - .workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2, - .subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2, - .block_rows = RS_SCATTER_BLOCK_ROWS, - }, -}; - -radix_sort_vk_t * -radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc) -{ - const 
uint32_t *spv[8] = { - init_spv, fill_spv, histogram_spv, prefix_spv, - scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, - }; - const uint32_t spv_sizes[8] = { - sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), - sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), - }; - return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateShaderModule(VkDevice _device, const VkShaderModuleCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateShaderModule(_device, pCreateInfo, pAllocator, pShaderModule); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyShaderModule(VkDevice _device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyShaderModule(_device, shaderModule, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreatePipelineLayout(_device, pCreateInfo, pAllocator, pPipelineLayout); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipelineLayout(VkDevice _device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipelineLayout(_device, pipelineLayout, pAllocator); -} - -VKAPI_ATTR VkResult VKAPI_CALL -vkCreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, - const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, - VkPipeline *pPipelines) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.CreateComputePipelines(_device, pipelineCache, createInfoCount, pCreateInfos, - pAllocator, pPipelines); -} - -VKAPI_ATTR void VKAPI_CALL -vkDestroyPipeline(VkDevice _device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - device->vk.dispatch_table.DestroyPipeline(_device, pipeline, pAllocator); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, - const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier *pImageMemoryBarriers) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, dependencyFlags, - memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, - pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, - uint32_t offset, uint32_t size, const void *pValues) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device 
*device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset, size, pValues); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint, pipeline); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ); -} - -VKAPI_ATTR VkDeviceAddress VKAPI_CALL -vkGetBufferDeviceAddress(VkDevice _device, const VkBufferDeviceAddressInfo *pInfo) -{ - VK_FROM_HANDLE(radv_device, device, _device); - return device->vk.dispatch_table.GetBufferDeviceAddress(_device, pInfo); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size, - uint32_t data) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdFillBuffer(commandBuffer, dstBuffer, dstOffset, size, data); -} - -VKAPI_ATTR void VKAPI_CALL -vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - device->vk.dispatch_table.CmdDispatchIndirect(commandBuffer, buffer, offset); -} diff --git a/src/amd/vulkan/radix_sort/radv_radix_sort.h b/src/amd/vulkan/radix_sort/radv_radix_sort.h deleted file mode 100644 index a0990610b9f..00000000000 --- a/src/amd/vulkan/radix_sort/radv_radix_sort.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright © 2022 Konstantin Seurer - * - * SPDX-License-Identifier: MIT - */ - -#ifndef RADV_RADIX_SORT_H -#define RADV_RADIX_SORT_H - -#include "radix_sort_vk_devaddr.h" - -radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc); - -#endif diff --git a/src/amd/vulkan/radix_sort/shaders/meson.build b/src/amd/vulkan/radix_sort/shaders/meson.build deleted file mode 100644 index 7b5545696b2..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/meson.build +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright © 2022 Konstantin Seurer -# SPDX-License-Identifier: MIT - -radix_sort_shaders = [ - 'init.comp', - 'fill.comp', - 'histogram.comp', - 'prefix.comp', - 'scatter_0_even.comp', - 'scatter_0_odd.comp', - 'scatter_1_even.comp', - 'scatter_1_odd.comp' -] - -shader_include_dir = dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64' - -shader_include_files = files( - 'bufref.h', - 'prefix_limits.h', - 'prefix.h', - 'push.h', - 'scatter.glsl', - dir_source_root + '/src/amd/vulkan/radix_sort/targets/u64/config.h' -) - -radix_sort_spv = [] -foreach s : radix_sort_shaders - _name = f'@s@.spv.h' - radix_sort_spv += custom_target( - _name, - input : s, - output : _name, - command : [ - prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', - '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_quiet, glslang_depfile, - ], - depfile : f'@_name@.d', - 
depend_files : shader_include_files, - ) -endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.h b/src/amd/vulkan/radix_sort/shaders/prefix.h deleted file mode 100644 index f9d470bb3f5..00000000000 --- a/src/amd/vulkan/radix_sort/shaders/prefix.h +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ - -// -// Requires several defines -// -#ifndef RS_PREFIX_LIMITS -#error "Error: \"prefix_limits.h\" not loaded" -#endif - -#ifndef RS_PREFIX_ARGS -#error "Error: RS_PREFIX_ARGS undefined" -#endif - -#ifndef RS_PREFIX_LOAD -#error "Error: RS_PREFIX_LOAD undefined" -#endif - -#ifndef RS_PREFIX_STORE -#error "Error: RS_PREFIX_STORE undefined" -#endif - -#ifndef RS_SUBGROUP_SIZE -#error "Error: RS_SUBGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SIZE -#error "Error: RS_WORKGROUP_SIZE undefined" -#endif - -#ifndef RS_WORKGROUP_SUBGROUPS -#error "Error: RS_WORKGROUP_SUBGROUPS undefined" -#endif - -// -// Optional switches: -// -// * Disable holding original inclusively scanned histogram values in registers. -// -// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS -// - -// -// Compute exclusive prefix of uint32_t[256] -// -void -rs_prefix(RS_PREFIX_ARGS) -{ -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Workgroup is a single subgroup so no shared memory is required. - // - - // - // Exclusive scan-add the histogram - // - const uint32_t h0 = RS_PREFIX_LOAD(0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0); - RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive - - // - // Each iteration is dependent on the previous so no unrolling. The - // compiler is free to hoist the loads upward though. - // - for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // - ii < RS_RADIX_SIZE; - ii += RS_SUBGROUP_SIZE) - { - const uint32_t h = RS_PREFIX_LOAD(ii); - const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; - h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - RS_PREFIX_STORE(ii) = h_inc - h; // exclusive - } - -#else - // - // Workgroup is multiple subgroups and uses shared memory to store - // the scan's intermediate results. - // - // Assumes a power-of-two subgroup, workgroup and radix size. - // - // Downsweep: Repeatedly scan reductions until they fit in a single - // subgroup. - // - // Upsweep: Then uniformly apply reductions to each subgroup. 
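/* The deleted prefix.h computes the exclusive prefix sum of the 256-bin radix
 * histogram with exactly this downsweep/upsweep structure; its sweep-size table
 * continues in the original comment below. A scalar C sketch of the two-level
 * scheme, for illustration only (not part of this patch); RADIX_SIZE,
 * SUBGROUP_SIZE and SWEEP0_SIZE are illustrative stand-ins for the RS_*
 * configuration values: */
#include <stdint.h>
#include <stdio.h>

#define RADIX_SIZE    256
#define SUBGROUP_SIZE 32
#define SWEEP0_SIZE   (RADIX_SIZE / SUBGROUP_SIZE)

static void
exclusive_scan_histogram(uint32_t h[RADIX_SIZE])
{
   uint32_t sweep0[SWEEP0_SIZE];

   /* Downsweep: exclusive scan inside each "subgroup" and record its total. */
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++) {
      uint32_t sum = 0;
      for (uint32_t i = 0; i < SUBGROUP_SIZE; i++) {
         uint32_t v = h[sg * SUBGROUP_SIZE + i];
         h[sg * SUBGROUP_SIZE + i] = sum; /* exclusive within the subgroup */
         sum += v;
      }
      sweep0[sg] = sum;
   }

   /* Scan the per-subgroup reductions; these now fit in a single subgroup. */
   uint32_t base = 0;
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++) {
      uint32_t v = sweep0[sg];
      sweep0[sg] = base;
      base += v;
   }

   /* Upsweep: uniformly add each subgroup's offset back to its elements. */
   for (uint32_t sg = 0; sg < SWEEP0_SIZE; sg++)
      for (uint32_t i = 0; i < SUBGROUP_SIZE; i++)
         h[sg * SUBGROUP_SIZE + i] += sweep0[sg];
}

int
main(void)
{
   uint32_t h[RADIX_SIZE];
   for (uint32_t i = 0; i < RADIX_SIZE; i++)
      h[i] = 1;
   exclusive_scan_histogram(h);
   printf("%u %u %u\n", h[0], h[1], h[255]); /* prints: 0 1 255 */
   return 0;
}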
- // - // - // Subgroup Size | 4 | 8 | 16 | 32 | 64 | - // --------------+----+----+----+----+----+ - // Sweep 0 | 64 | 32 | 16 | 8 | 4 | sweep_0[] - // Sweep 1 | 16 | 4 | - | - | - | sweep_1[] - // Sweep 2 | 4 | - | - | - | - | sweep_2[] - // --------------+----+----+----+----+----+ - // Total dwords | 84 | 36 | 16 | 8 | 4 | - // --------------+----+----+----+----+----+ - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - uint32_t h_exc[RS_H_COMPONENTS]; -#endif - - // - // Downsweep 0 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - const uint32_t h_inc = subgroupInclusiveAdd(h); - - const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); - - // -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - h_exc[ii] = h_inc - h; -#else - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; -#endif - } - - barrier(); - - // - // Skip generalizing these sweeps for all possible subgroups -- just - // write them directly. - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 - // -#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations -#endif - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - } - -#elif (RS_SUBGROUP_SIZE == 8) - -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 1 - // - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - } - -#elif (RS_SUBGROUP_SIZE == 4) - - ////////////////////////////////////////////////////////////////////// - // - // Scan 0 and Downsweep 1 - // -#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) - - if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations - { - const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(gl_SubgroupID) = 
subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); - const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); - - RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; - RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); - } -#endif - - barrier(); - - // - // Scan 1 and Downsweep 2 - // -#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) - if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations - { - const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#else - - [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations - { - const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; - const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); - const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); - - RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; - RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); - } - -#endif - - barrier(); - - // - // Scan 2 - // - // 4 invocations - // - if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) - { - const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); - const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); - - RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; - } - -#else -#error "Error: Unsupported subgroup size" -#endif - - barrier(); - - ////////////////////////////////////////////////////////////////////// - // - // Final upsweep 0 - // -#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - - // clang format issue -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 8) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#else - const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); -#endif - } - -#elif (RS_SUBGROUP_SIZE == 4) - - [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) - { - const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; - const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; - const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; - -#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#else - const uint32_t 
h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); - - RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = - h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); -#endif - } - -#else -#error "Error: Unsupported subgroup size" -#endif - -#endif -} - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/targets/u64/config.h b/src/amd/vulkan/radix_sort/targets/u64/config.h deleted file mode 100644 index fa1a51eb017..00000000000 --- a/src/amd/vulkan/radix_sort/targets/u64/config.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2021 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ -#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ - -// -// -// - -// clang-format off -#define RS_KEYVAL_DWORDS 2 - -#define RS_FILL_WORKGROUP_SIZE_LOG2 7 -#define RS_FILL_BLOCK_ROWS 8 - -#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8 -#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6 -#define RS_HISTOGRAM_BLOCK_ROWS 14 - -#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8 -#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6 - -#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8 -#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6 -#define RS_SCATTER_BLOCK_ROWS 14 -// clang-format on - -// -// -// - -#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_ diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index dbb5595494f..c6fe70528b6 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -4,16 +4,12 @@ * SPDX-License-Identifier: MIT */ -#include "radv_sqtt.h" - #include "meta/radv_meta.h" #include "nir_builder.h" #include "radv_cs.h" #include "radv_entrypoints.h" -#include "radix_sort/common/vk/barrier.h" -#include "radix_sort/radv_radix_sort.h" -#include "radix_sort/shaders/push.h" +#include "radix_sort/radix_sort_u64.h" #include "bvh/build_interface.h" #include "bvh/bvh.h" @@ -21,30 +17,6 @@ #include "vk_acceleration_structure.h" #include "vk_common_entrypoints.h" -static const uint32_t leaf_spv[] = { -#include "bvh/leaf.spv.h" -}; - -static const uint32_t leaf_always_active_spv[] = { -#include "bvh/leaf_always_active.spv.h" -}; - -static const uint32_t morton_spv[] = { -#include "bvh/morton.spv.h" -}; - -static const uint32_t lbvh_main_spv[] = { -#include "bvh/lbvh_main.spv.h" -}; - -static const uint32_t lbvh_generate_ir_spv[] = { -#include "bvh/lbvh_generate_ir.spv.h" -}; - -static const uint32_t ploc_spv[] = { -#include "bvh/ploc_internal.spv.h" -}; - static const uint32_t copy_spv[] = { #include "bvh/copy.spv.h" }; @@ -65,21 +37,6 @@ static const uint32_t update_spv[] = { #include "bvh/update.spv.h" }; -#define KEY_ID_PAIR_SIZE 8 -#define MORTON_BIT_SIZE 24 - -enum internal_build_type { - INTERNAL_BUILD_TYPE_LBVH, - INTERNAL_BUILD_TYPE_PLOC, - INTERNAL_BUILD_TYPE_UPDATE, -}; - -struct build_config { - enum internal_build_type internal_type; - bool compact; - bool updateable; -}; - struct acceleration_structure_layout { uint32_t geometry_info_offset; uint32_t bvh_offset; @@ -89,71 +46,23 @@ struct acceleration_structure_layout { }; struct scratch_layout { - uint32_t size; uint32_t update_size; - uint32_t header_offset; - - /* Used for UPDATE only. 
*/ - uint32_t internal_ready_count_offset; - - /* Used for BUILD only. */ - - uint32_t sort_buffer_offset[2]; - uint32_t sort_internal_offset; - - uint32_t ploc_prefix_sum_partition_offset; - uint32_t lbvh_node_offset; - - uint32_t ir_offset; - uint32_t internal_node_offset; }; -static struct build_config -build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfoKHR *build_info) -{ - struct build_config config = {0}; - - if (leaf_count <= 4) - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && - !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) - config.internal_type = INTERNAL_BUILD_TYPE_PLOC; - else - config.internal_type = INTERNAL_BUILD_TYPE_LBVH; - - if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; - - if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && - build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) - config.updateable = true; - - if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) - config.compact = true; - - return config; -} +enum radv_encode_key_bits { + RADV_ENCODE_KEY_COMPACT = 1, +}; static void -get_build_layout(struct radv_device *device, uint32_t leaf_count, - const VkAccelerationStructureBuildGeometryInfoKHR *build_info, - struct acceleration_structure_layout *accel_struct, struct scratch_layout *scratch) +radv_get_acceleration_structure_layout(struct radv_device *device, uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + struct acceleration_structure_layout *accel_struct) { uint32_t internal_count = MAX2(leaf_count, 2) - 1; - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - if (build_info->geometryCount) { - if (build_info->pGeometries) - geometry_type = build_info->pGeometries[0].geometryType; - else - geometry_type = build_info->ppGeometries[0]->geometryType; - } + VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(build_info); uint32_t bvh_leaf_size; switch (geometry_type) { @@ -170,92 +79,52 @@ get_build_layout(struct radv_device *device, uint32_t leaf_count, unreachable("Unknown VkGeometryTypeKHR"); } - if (accel_struct) { - uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; - uint32_t offset = 0; - offset += sizeof(struct radv_accel_struct_header); + uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; + uint32_t offset = 0; + offset += sizeof(struct radv_accel_struct_header); - if (device->rra_trace.accel_structs) { - accel_struct->geometry_info_offset = offset; - offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; - } - /* Parent links, which have to go directly before bvh_offset as we index them using negative - * offsets from there. */ - offset += bvh_size / 64 * 4; - - /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. 
*/ - offset = ALIGN(offset, 64); - accel_struct->bvh_offset = offset; - - /* root node */ - offset += sizeof(struct radv_bvh_box32_node); - - accel_struct->leaf_nodes_offset = offset; - offset += bvh_leaf_size * leaf_count; - - accel_struct->internal_nodes_offset = offset; - /* Factor out the root node. */ - offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - - accel_struct->size = offset; + if (device->rra_trace.accel_structs) { + accel_struct->geometry_info_offset = offset; + offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; } + /* Parent links, which have to go directly before bvh_offset as we index them using negative + * offsets from there. */ + offset += bvh_size / 64 * 4; - if (scratch) { - radix_sort_vk_memory_requirements_t requirements = { - 0, - }; - if (radv_device_init_accel_struct_build_state(device) == VK_SUCCESS) - radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort, leaf_count, - &requirements); + /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */ + offset = ALIGN(offset, 64); + accel_struct->bvh_offset = offset; - uint32_t offset = 0; + /* root node */ + offset += sizeof(struct radv_bvh_box32_node); - uint32_t ploc_scratch_space = 0; - uint32_t lbvh_node_space = 0; + accel_struct->leaf_nodes_offset = offset; + offset += bvh_leaf_size * leaf_count; - struct build_config config = build_config(leaf_count, build_info); + accel_struct->internal_nodes_offset = offset; + /* Factor out the root node. */ + offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) - ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); - else - lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + accel_struct->size = offset; +} - scratch->header_offset = offset; - offset += sizeof(struct radv_ir_header); +static void +radv_get_scratch_layout(struct radv_device *device, uint32_t leaf_count, struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; - scratch->sort_buffer_offset[0] = offset; - offset += requirements.keyvals_size; + uint32_t offset = 0; - scratch->sort_buffer_offset[1] = offset; - offset += requirements.keyvals_size; + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); - scratch->sort_internal_offset = offset; - /* Internal sorting data is not needed when PLOC/LBVH are invoked, - * save space by aliasing them */ - scratch->ploc_prefix_sum_partition_offset = offset; - scratch->lbvh_node_offset = offset; - offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + uint32_t update_offset = 0; - scratch->ir_offset = offset; - offset += sizeof(struct radv_ir_node) * leaf_count; + update_offset += sizeof(vk_aabb) * leaf_count; + scratch->internal_ready_count_offset = update_offset; - scratch->internal_node_offset = offset; - offset += sizeof(struct radv_ir_box_node) * internal_count; - - scratch->size = offset; - - if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) { - uint32_t update_offset = 0; - - update_offset += sizeof(radv_aabb) * leaf_count; - scratch->internal_ready_count_offset = update_offset; - - update_offset += sizeof(uint32_t) * internal_count; - scratch->update_size = update_offset; - } else { - scratch->update_size = offset; - } - } + update_offset += sizeof(uint32_t) * internal_count; + scratch->update_size = update_offset; } 
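For reference, the offset arithmetic in the two layout helpers above can be reproduced standalone. The sketch below is illustrative only, not the driver code: it assumes 64-byte triangle leaf nodes, a 128-byte radv_accel_struct_header and 24-byte vk_aabb entries, uses the 128-byte box32 node size asserted in the next hunk, and omits the RRA geometry-info block.

#include <stdint.h>
#include <stdio.h>

#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))

/* Illustrative sizes; the real values come from the radv_bvh and vk_ir structs. */
#define EXAMPLE_HEADER_SIZE 128u /* assumed sizeof(struct radv_accel_struct_header) */
#define EXAMPLE_LEAF_SIZE 64u    /* assumed triangle leaf node size */
#define EXAMPLE_BOX32_SIZE 128u  /* sizeof(struct radv_bvh_box32_node) */
#define EXAMPLE_AABB_SIZE 24u    /* assumed sizeof(vk_aabb): two vec3 bounds */

static uint32_t
example_accel_struct_size(uint32_t leaf_count)
{
   uint32_t internal_count = (leaf_count > 2 ? leaf_count : 2) - 1; /* MAX2(leaf_count, 2) - 1 */
   uint64_t bvh_size = (uint64_t)EXAMPLE_LEAF_SIZE * leaf_count + (uint64_t)EXAMPLE_BOX32_SIZE * internal_count;

   uint32_t offset = EXAMPLE_HEADER_SIZE;
   offset += bvh_size / 64 * 4;              /* one 32-bit parent link per 64 bytes of BVH */
   offset = ALIGN_POT(offset, 64);           /* bvh_offset: RT nodes need 64-byte alignment */
   offset += EXAMPLE_BOX32_SIZE;             /* root node */
   offset += EXAMPLE_LEAF_SIZE * leaf_count; /* leaf nodes */
   offset += EXAMPLE_BOX32_SIZE * (internal_count - 1); /* remaining internal nodes */
   return offset;
}

static uint32_t
example_update_scratch_size(uint32_t leaf_count)
{
   uint32_t internal_count = (leaf_count > 2 ? leaf_count : 2) - 1;
   /* Per-leaf bounds followed by one ready-count per internal node. */
   return EXAMPLE_AABB_SIZE * leaf_count + (uint32_t)sizeof(uint32_t) * internal_count;
}

int
main(void)
{
   /* e.g. a BLAS with 1000 triangle primitives */
   printf("accel struct size:   %u bytes\n", example_accel_struct_size(1000));
   printf("update scratch size: %u bytes\n", example_update_scratch_size(1000));
   return 0;
}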
VKAPI_ATTR void VKAPI_CALL @@ -272,17 +141,11 @@ radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStruc STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); - uint32_t leaf_count = 0; - for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) - leaf_count += pMaxPrimitiveCounts[i]; + if (radv_device_init_accel_struct_build_state(device) != VK_SUCCESS) + return; - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - get_build_layout(device, leaf_count, pBuildInfo, &accel_struct, &scratch); - - pSizeInfo->accelerationStructureSize = accel_struct.size; - pSizeInfo->updateScratchSize = scratch.update_size; - pSizeInfo->buildScratchSize = scratch.size; + vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts, pSizeInfo, + &device->meta_state.accel_struct_build.build_args); } VKAPI_ATTR VkResult VKAPI_CALL @@ -319,24 +182,13 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table; dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.ploc_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_generate_ir_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_main_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_updateable_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc); - dispatch->DestroyPipeline(_device, state->accel_struct_build.morton_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.update_pipeline, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.copy_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.ploc_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_generate_ir_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_main_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.leaf_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.encode_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.header_p_layout, &state->alloc); - radv_DestroyPipelineLayout(_device, state->accel_struct_build.morton_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.update_p_layout, &state->alloc); if (state->accel_struct_build.radix_sort) @@ -492,7 +344,7 @@ radv_device_init_null_accel_struct(struct radv_device *device) }; for (uint32_t child = 0; child < 4; child++) { - root.coords[child] = (radv_aabb){ + root.coords[child] = (vk_aabb){ .min.x = NAN, .min.y = NAN, .min.z = NAN, @@ -524,6 +376,328 @@ radv_device_init_null_accel_struct(struct radv_device *device) return VK_SUCCESS; } +static VkDeviceSize +radv_get_as_size(VkDevice _device, const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, uint32_t leaf_count) +{ + VK_FROM_HANDLE(radv_device, device, _device); + + struct acceleration_structure_layout accel_struct; + radv_get_acceleration_structure_layout(device, leaf_count, pBuildInfo, &accel_struct); + return accel_struct.size; +} + +static VkDeviceSize +radv_get_update_scratch_size(struct vk_device *vk_device, uint32_t leaf_count) +{ + struct radv_device *device = container_of(vk_device, struct radv_device, vk); + + struct scratch_layout scratch; + radv_get_scratch_layout(device, leaf_count, &scratch); + return scratch.update_size; +} + +static uint32_t +radv_get_encode_key(VkAccelerationStructureTypeKHR type, VkBuildAccelerationStructureFlagBitsKHR flags) +{ + if (flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) + return RADV_ENCODE_KEY_COMPACT; + + return 0; +} + +static VkResult +radv_encode_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + bool compact = key & RADV_ENCODE_KEY_COMPACT; + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + compact ? device->meta_state.accel_struct_build.encode_compact_pipeline + : device->meta_state.accel_struct_build.encode_pipeline); + + return VK_SUCCESS; +} + +static void +radv_encode_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, uint32_t key, + struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + uint32_t dst_offset = layout.internal_nodes_offset - layout.bvh_offset; + radv_update_buffer_cp(cmd_buffer, intermediate_header_addr + offsetof(struct vk_ir_header, dst_node_offset), + &dst_offset, sizeof(uint32_t)); + } + + const struct encode_args args = { + .intermediate_bvh = intermediate_as_addr, + .output_bvh = vk_acceleration_structure_get_va(dst) + layout.bvh_offset, + .header = intermediate_header_addr, + .output_bvh_offset = layout.bvh_offset, + .leaf_node_count = leaf_count, + .geometry_type = vk_get_as_geometry_type(build_info), + }; + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + struct radv_dispatch_info dispatch = { + .unaligned = true, + .ordered = true, + .blocks = {leaf_count, 1, 1}, + }; + + radv_compute_dispatch(cmd_buffer, &dispatch); +} + +static VkResult +radv_init_header_bind_pipeline(VkCommandBuffer commandBuffer, uint32_t key) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (!(key & RADV_ENCODE_KEY_COMPACT)) + return VK_SUCCESS; + + /* Wait for encoding to finish. 
*/ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.header_pipeline); + + return VK_SUCCESS; +} + +static void +radv_init_header(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, VkDeviceAddress intermediate_header_addr, uint32_t leaf_count, + uint32_t key, struct vk_acceleration_structure *dst) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + size_t base = offsetof(struct radv_accel_struct_header, compacted_size); + + uint64_t instance_count = build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? leaf_count : 0; + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + if (key & RADV_ENCODE_KEY_COMPACT) { + base = offsetof(struct radv_accel_struct_header, geometry_count); + + struct header_args args = { + .src = intermediate_header_addr, + .dst = vk_acceleration_structure_get_va(dst), + .bvh_offset = layout.bvh_offset, + .instance_count = instance_count, + }; + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); + + radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); + } + + struct radv_accel_struct_header header; + + header.instance_offset = layout.bvh_offset + sizeof(struct radv_bvh_box32_node); + header.instance_count = instance_count; + header.compacted_size = layout.size; + + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + + align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); + + header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - + sizeof(uint64_t) * header.instance_count; + + header.build_flags = build_info->flags; + header.geometry_count = build_info->geometryCount; + + radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(dst) + base, (const char *)&header + base, + sizeof(header) - base); + + if (device->rra_trace.accel_structs) { + uint64_t geometry_infos_size = build_info->geometryCount * sizeof(struct radv_accel_struct_geometry_info); + + struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); + if (!geometry_infos) + return; + + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geometry = + build_info->pGeometries ? 
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + geometry_infos[i].type = geometry->geometryType; + geometry_infos[i].flags = geometry->flags; + geometry_infos[i].primitive_count = build_range_infos[i].primitiveCount; + } + + radv_CmdUpdateBuffer(commandBuffer, dst->buffer, dst->offset + layout.geometry_info_offset, geometry_infos_size, + geometry_infos); + free(geometry_infos); + } +} + +static void +radv_init_update_scratch(VkCommandBuffer commandBuffer, VkDeviceAddress scratch, uint32_t leaf_count, + struct vk_acceleration_structure *src_as, struct vk_acceleration_structure *dst_as) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + /* Prepare ready counts for internal nodes */ + radv_fill_buffer(cmd_buffer, NULL, NULL, scratch + layout.internal_ready_count_offset, + layout.update_size - layout.internal_ready_count_offset, 0x0); +} + +static void +radv_update_bind_pipeline(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + /* Wait for update scratch initialization to finish. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | + radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); + + device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.accel_struct_build.update_pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; + + return geometry_id_and_flags; +} + +static void +radv_update_as(VkCommandBuffer commandBuffer, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, uint32_t leaf_count, + struct vk_acceleration_structure *dst, struct vk_acceleration_structure *src) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + + if (src != dst) { + VK_FROM_HANDLE(radv_buffer, src_as_buffer, src->buffer); + VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst->buffer); + + struct acceleration_structure_layout layout; + radv_get_acceleration_structure_layout(device, leaf_count, build_info, &layout); + + /* Copy header/metadata */ + radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src->offset, + dst_as_buffer->offset + dst->offset, layout.bvh_offset); + } + + struct scratch_layout layout; + radv_get_scratch_layout(device, leaf_count, &layout); + + struct update_args update_consts = { + .src = vk_acceleration_structure_get_va(src), + .dst = vk_acceleration_structure_get_va(dst), + .leaf_bounds = build_info->scratchData.deviceAddress, + .internal_ready_count = build_info->scratchData.deviceAddress + layout.internal_ready_count_offset, + .leaf_node_count = leaf_count, + }; + + uint32_t first_id = 0; + for (uint32_t i = 0; i < build_info->geometryCount; i++) { + const VkAccelerationStructureGeometryKHR *geom = + build_info->pGeometries ?
&build_info->pGeometries[i] : build_info->ppGeometries[i]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &build_range_infos[i]; + + update_consts.geom_data = vk_fill_geometry_data(build_info->type, first_id, i, geom, build_range_info); + + vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); + radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); + + first_id += build_range_info->primitiveCount; + } +} + +static const struct radix_sort_vk_target_config radix_sort_config = { + .keyval_dwords = 2, + .fill.workgroup_size_log2 = 7, + .fill.block_rows = 8, + .histogram.workgroup_size_log2 = 8, + .histogram.subgroup_size_log2 = 6, + .histogram.block_rows = 14, + .prefix.workgroup_size_log2 = 8, + .prefix.subgroup_size_log2 = 6, + .scatter.workgroup_size_log2 = 8, + .scatter.subgroup_size_log2 = 6, + .scatter.block_rows = 14, +}; + +static const struct vk_acceleration_structure_build_ops build_ops = { + .get_as_size = radv_get_as_size, + .get_update_scratch_size = radv_get_update_scratch_size, + .get_encode_key[0] = radv_get_encode_key, + .get_encode_key[1] = radv_get_encode_key, + .encode_bind_pipeline[0] = radv_encode_bind_pipeline, + .encode_bind_pipeline[1] = radv_init_header_bind_pipeline, + .encode_as[0] = radv_encode_as, + .encode_as[1] = radv_init_header, + .init_update_scratch = radv_init_update_scratch, + .update_bind_pipeline[0] = radv_update_bind_pipeline, + .update_as[0] = radv_update_as, +}; + +static void +radv_write_buffer_cp(VkCommandBuffer commandBuffer, VkDeviceAddress addr, void *data, uint32_t size) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_update_buffer_cp(cmd_buffer, addr, data, size); +} + +static void +radv_flush_buffer_write_cp(VkCommandBuffer commandBuffer) +{ +} + +static void +radv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_unaligned_dispatch(cmd_buffer, x, y, z); +} + +static void +radv_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer, VkDeviceAddress addr, VkDeviceSize size, uint32_t data) +{ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_fill_buffer(cmd_buffer, NULL, NULL, addr, size, data); +} + VkResult radv_device_init_accel_struct_build_state(struct radv_device *device) { @@ -533,38 +707,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (device->meta_state.accel_struct_build.radix_sort) goto exit; - result = create_build_pipeline_spv(device, leaf_always_active_spv, sizeof(leaf_always_active_spv), - sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_updateable_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, leaf_spv, sizeof(leaf_spv), sizeof(struct leaf_args), - &device->meta_state.accel_struct_build.leaf_pipeline, - &device->meta_state.accel_struct_build.leaf_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_main_spv, sizeof(lbvh_main_spv), sizeof(struct lbvh_main_args), - &device->meta_state.accel_struct_build.lbvh_main_pipeline, - &device->meta_state.accel_struct_build.lbvh_main_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, lbvh_generate_ir_spv, 
sizeof(lbvh_generate_ir_spv), - sizeof(struct lbvh_generate_ir_args), - &device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline, - &device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout); - if (result != VK_SUCCESS) - goto exit; - - result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args), - &device->meta_state.accel_struct_build.ploc_pipeline, - &device->meta_state.accel_struct_build.ploc_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), &device->meta_state.accel_struct_build.encode_pipeline, &device->meta_state.accel_struct_build.encode_p_layout); @@ -584,20 +726,33 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) goto exit; - result = create_build_pipeline_spv(device, morton_spv, sizeof(morton_spv), sizeof(struct morton_args), - &device->meta_state.accel_struct_build.morton_pipeline, - &device->meta_state.accel_struct_build.morton_p_layout); - if (result != VK_SUCCESS) - goto exit; - result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), &device->meta_state.accel_struct_build.update_pipeline, &device->meta_state.accel_struct_build.update_p_layout); if (result != VK_SUCCESS) goto exit; - device->meta_state.accel_struct_build.radix_sort = - radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache); + device->meta_state.accel_struct_build.radix_sort = vk_create_radix_sort_u64( + radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache, radix_sort_config); + + result = vk_meta_device_init(&device->vk, &device->meta_state.device); + if (result != VK_SUCCESS) + goto exit; + + device->meta_state.device.pipeline_cache = device->meta_state.cache; + + device->vk.as_build_ops = &build_ops; + device->vk.write_buffer_cp = radv_write_buffer_cp; + device->vk.flush_buffer_write_cp = radv_flush_buffer_write_cp; + device->vk.cmd_dispatch_unaligned = radv_cmd_dispatch_unaligned; + device->vk.cmd_fill_buffer_addr = radv_cmd_fill_buffer_addr; + + struct vk_acceleration_structure_build_args *build_args = &device->meta_state.accel_struct_build.build_args; + build_args->subgroup_size = 64; + build_args->bvh_bounds_offset = offsetof(struct radv_accel_struct_header, aabb); + build_args->emit_markers = device->sqtt.bo; + build_args->radix_sort = device->meta_state.accel_struct_build.radix_sort; + exit: mtx_unlock(&device->meta_state.mtx); return result; @@ -616,727 +771,6 @@ radv_device_init_accel_struct_copy_state(struct radv_device *device) return result; } -struct bvh_state { - uint32_t node_count; - uint32_t scratch_offset; - - uint32_t leaf_node_count; - uint32_t internal_node_count; - uint32_t leaf_node_size; - - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - struct build_config config; - - /* Radix sort state */ - uint32_t scatter_blocks; - uint32_t count_ru_scatter; - uint32_t histo_blocks; - uint32_t count_ru_histo; - struct rs_push_scatter push_scatter; -}; - -struct radv_bvh_batch_state { - bool any_compact; - bool any_non_compact; - bool any_ploc; - bool any_lbvh; - bool any_updateable; - bool any_non_updateable; - bool any_update; -}; - -static uint32_t -pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) -{ - uint32_t geometry_id_and_flags = geometry_id; - if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) - 
geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; - - return geometry_id_and_flags; -} - -static struct radv_bvh_geometry_data -fill_geometry_data(VkAccelerationStructureTypeKHR type, struct bvh_state *bvh_state, uint32_t geom_index, - const VkAccelerationStructureGeometryKHR *geometry, - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) -{ - struct radv_bvh_geometry_data data = { - .first_id = bvh_state->node_count, - .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), - .geometry_type = geometry->geometryType, - }; - - switch (geometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.triangles.vertexData.deviceAddress + - build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; - data.indices = geometry->geometry.triangles.indexData.deviceAddress; - - if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) - data.data += build_range_info->primitiveOffset; - else - data.indices += build_range_info->primitiveOffset; - - data.transform = geometry->geometry.triangles.transformData.deviceAddress; - if (data.transform) - data.transform += build_range_info->transformOffset; - - data.stride = geometry->geometry.triangles.vertexStride; - data.vertex_format = geometry->geometry.triangles.vertexFormat; - data.index_format = geometry->geometry.triangles.indexType; - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); - - data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; - data.stride = geometry->geometry.aabbs.stride; - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); - - data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; - - if (geometry->geometry.instances.arrayOfPointers) - data.stride = 8; - else - data.stride = sizeof(VkAccelerationStructureInstanceKHR); - break; - default: - unreachable("Unknown geometryType"); - } - - return data; -} - -static void -build_leaves(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states, - bool updateable) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "leaves"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - updateable ? 
device->meta_state.accel_struct_build.leaf_updateable_pipeline - : device->meta_state.accel_struct_build.leaf_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - if (bvh_states[i].config.updateable != updateable) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - struct leaf_args leaf_consts = { - .ir = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.leaf_nodes_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - leaf_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.leaf_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -morton_generate(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "morton"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.morton_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - const struct morton_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.morton_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - /* Copyright 2019 The Fuchsia Authors. 
*/ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "sort"); - - radix_sort_vk_t *rs = device->meta_state.accel_struct_build.radix_sort; - - /* - * OVERVIEW - * - * 1. Pad the keyvals in `scatter_even`. - * 2. Zero the `histograms` and `partitions`. - * --- BARRIER --- - * 3. HISTOGRAM is dispatched before PREFIX. - * --- BARRIER --- - * 4. PREFIX is dispatched before the first SCATTER. - * --- BARRIER --- - * 5. One or more SCATTER dispatches. - * - * Note that the `partitions` buffer can be zeroed anytime before the first - * scatter. - */ - - /* How many passes? */ - uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); - uint32_t keyval_bits = keyval_bytes * 8; - uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); - uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].node_count) - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; - else - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; - } - - /* - * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS - * - * Pad fractional blocks with max-valued keyvals. - * - * Zero the histograms and partitions buffer. - * - * This assumes the partitions follow the histograms. - */ - - /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ - - /* How many scatter blocks? */ - uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; - uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; - - /* - * How many histogram blocks? - * - * Note that it's OK to have more max-valued digits counted by the histogram - * than sorted by the scatters because the sort is stable. - */ - uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; - uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; - - uint32_t pass_idx = (keyval_bytes - passes); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].scatter_blocks = (bvh_states[i].node_count + scatter_block_kvs - 1) / scatter_block_kvs; - bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; - - bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; - bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; - - /* Fill with max values */ - if (bvh_states[i].count_ru_histo > bvh_states[i].node_count) { - radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + bvh_states[i].node_count * keyval_bytes, - (bvh_states[i].count_ru_histo - bvh_states[i].node_count) * keyval_bytes, 0xFFFFFFFF); - } - - /* - * Zero histograms and invalidate partitions. - * - * Note that the partition invalidation only needs to be performed once - * because the even/odd scatter dispatches rely on the the previous pass to - * leave the partitions in an invalid state. - * - * Note that the last workgroup doesn't read/write a partition so it doesn't - * need to be initialized. 
- */ - uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; - - uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, - histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); - } - - /* - * Pipeline: HISTOGRAM - * - * TODO(allanmac): All subgroups should try to process approximately the same - * number of blocks in order to minimize tail effects. This was implemented - * and reverted but should be reimplemented and benchmarked later. - */ - vk_barrier_transfer_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - rs->pipelines.named.histogram); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - /* Dispatch histogram */ - struct rs_push_histogram push_histogram = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - .devaddr_keyvals = keyvals_even_addr, - .passes = passes, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_histogram), &push_histogram); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); - } - - /* - * Pipeline: PREFIX - * - * Launch one workgroup per pass. - */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - struct rs_push_prefix push_prefix = { - .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - }; - - vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_prefix), &push_prefix); - - vk_common_CmdDispatch(commandBuffer, passes, 1, 1); - } - - /* Pipeline: SCATTER */ - vk_barrier_compute_w_to_compute_r(commandBuffer); - - uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - for (uint32_t i = 0; i < infoCount; i++) { - uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; - uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; - uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - bvh_states[i].push_scatter = (struct rs_push_scatter){ - .devaddr_keyvals_even = keyvals_even_addr, - .devaddr_keyvals_odd = keyvals_odd_addr, - .devaddr_partitions = internal_addr + rs->internal.partitions.offset, - .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, - }; - } - - bool is_even = true; - - while (true) { - uint32_t pass_dword = pass_idx / 4; - - /* Bind new pipeline */ - VkPipeline p = - is_even ? 
rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); - - /* Update push constants that changed */ - VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even - : rs->pipeline_layouts.named.scatter[pass_dword].odd; - - for (uint32_t i = 0; i < infoCount; i++) { - if (!bvh_states[i].node_count) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; - - vk_common_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), - &bvh_states[i].push_scatter); - - vk_common_CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); - - bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); - } - - /* Continue? */ - if (++pass_idx >= keyval_bytes) - break; - - vk_barrier_compute_w_to_compute_r(commandBuffer); - - is_even ^= true; - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); - - cmd_buffer->state.flush_bits |= flush_bits; -} - -static void -lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "lbvh"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_main_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t internal_node_count = MAX2(bvh_states[i].node_count, 2) - 1; - - const struct lbvh_main_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .id_count = bvh_states[i].node_count, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_main_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, internal_node_count, 1, 1); - bvh_states[i].node_count = internal_node_count; - bvh_states[i].internal_node_count = internal_node_count; - } - - cmd_buffer->state.flush_bits |= flush_bits; - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) - continue; - - const struct lbvh_generate_ir_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .internal_node_base = bvh_states[i].scratch.internal_node_offset - 
bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - radv_unaligned_dispatch(cmd_buffer, bvh_states[i].internal_node_count, 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "ploc"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.ploc_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) - continue; - - uint32_t src_scratch_offset = bvh_states[i].scratch_offset; - uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) - ? bvh_states[i].scratch.sort_buffer_offset[1] - : bvh_states[i].scratch.sort_buffer_offset[0]; - - const struct ploc_args consts = { - .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, - .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, - .prefix_scan_partitions = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, - .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.ploc_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); - vk_common_CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -encode_nodes(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, bool compact) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "encode"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - compact ? device->meta_state.accel_struct_build.encode_compact_pipeline - : device->meta_state.accel_struct_build.encode_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (compact != bvh_states[i].config.compact) - continue; - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; - - /* If the geometry count is 0, then the size does not matter - * because it will be multiplied with 0. - */ - if (pInfos[i].geometryCount) - geometry_type = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries[0].geometryType : pInfos[i].ppGeometries[0]->geometryType; - - if (bvh_states[i].config.compact) { - uint32_t dst_offset = bvh_states[i].accel_struct.internal_nodes_offset - bvh_states[i].accel_struct.bvh_offset; - radv_update_buffer_cp(cmd_buffer, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, dst_node_offset), - &dst_offset, sizeof(uint32_t)); - } - - const struct encode_args args = { - .intermediate_bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, - .output_bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.bvh_offset, - .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .output_bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .leaf_node_count = bvh_states[i].leaf_node_count, - .geometry_type = geometry_type, - }; - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - struct radv_dispatch_info dispatch = { - .unaligned = true, - .ordered = true, - .va = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + - offsetof(struct radv_ir_header, ir_internal_node_count), - }; - - radv_compute_dispatch(cmd_buffer, &dispatch); - } - /* This is the final access to the leaf nodes, no need to flush */ - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_header(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - struct radv_bvh_batch_state *batch_state) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - if (batch_state->any_compact) { - radv_write_user_event_marker(cmd_buffer, UserEventPush, "header"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.header_pipeline); - } - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - size_t base = offsetof(struct radv_accel_struct_header, compacted_size); - - uint64_t instance_count = - pInfos[i].type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? 
bvh_states[i].leaf_node_count : 0; - - if (bvh_states[i].config.compact) { - base = offsetof(struct radv_accel_struct_header, geometry_count); - - struct header_args args = { - .src = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - .dst = vk_acceleration_structure_get_va(accel_struct), - .bvh_offset = bvh_states[i].accel_struct.bvh_offset, - .instance_count = instance_count, - }; - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); - - radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); - } - - struct radv_accel_struct_header header; - - header.instance_offset = bvh_states[i].accel_struct.bvh_offset + sizeof(struct radv_bvh_box32_node); - header.instance_count = instance_count; - header.compacted_size = bvh_states[i].accel_struct.size; - - header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); - header.copy_dispatch_size[1] = 1; - header.copy_dispatch_size[2] = 1; - - header.serialization_size = - header.compacted_size + - align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); - - header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - - sizeof(uint64_t) * header.instance_count; - - header.build_flags = pInfos[i].flags; - header.geometry_count = pInfos[i].geometryCount; - - radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(accel_struct) + base, - (const char *)&header + base, sizeof(header) - base); - } - - if (batch_state->any_compact) - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - -static void -init_geometry_infos(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) -{ - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) - continue; - VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); - - uint64_t geometry_infos_size = pInfos[i].geometryCount * sizeof(struct radv_accel_struct_geometry_info); - - struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); - if (!geometry_infos) - continue; - - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geometry = - pInfos[i].pGeometries ? 
pInfos[i].pGeometries + j : pInfos[i].ppGeometries[j]; - geometry_infos[j].type = geometry->geometryType; - geometry_infos[j].flags = geometry->flags; - geometry_infos[j].primitive_count = ppBuildRangeInfos[i][j].primitiveCount; - } - - radv_CmdUpdateBuffer(commandBuffer, accel_struct->buffer, - accel_struct->offset + bvh_states[i].accel_struct.geometry_info_offset, geometry_infos_size, - geometry_infos); - - free(geometry_infos); - } -} - -static void -update(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - - radv_write_user_event_marker(cmd_buffer, UserEventPush, "update"); - - device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.accel_struct_build.update_pipeline); - - for (uint32_t i = 0; i < infoCount; ++i) { - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) - continue; - - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - VK_FROM_HANDLE(vk_acceleration_structure, src_bvh, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_bvh, pInfos[i].dstAccelerationStructure); - struct update_args update_consts = { - .src = vk_acceleration_structure_get_va(src_bvh), - .dst = vk_acceleration_structure_get_va(dst_bvh), - .leaf_bounds = pInfos[i].scratchData.deviceAddress, - .internal_ready_count = - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - .leaf_node_count = leaf_node_count, - }; - - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? 
&pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - - const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; - - update_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); - - vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); - radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); - - bvh_states[i].leaf_node_count += build_range_info->primitiveCount; - bvh_states[i].node_count += build_range_info->primitiveCount; - } - } - - radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); -} - VKAPI_ATTR void VKAPI_CALL radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, @@ -1352,132 +786,14 @@ radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t i return; } - enum radv_cmd_flush_bits flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_WRITE_BIT, NULL, NULL) | - radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT, NULL, NULL); - radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); - struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); - - radv_describe_begin_accel_struct_build(cmd_buffer, infoCount); - - struct radv_bvh_batch_state batch_state = {0}; - - for (uint32_t i = 0; i < infoCount; ++i) { - uint32_t leaf_node_count = 0; - for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { - leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; - } - - get_build_layout(device, leaf_node_count, pInfos + i, &bvh_states[i].accel_struct, &bvh_states[i].scratch); - - struct build_config config = build_config(leaf_node_count, pInfos + i); - bvh_states[i].config = config; - - if (config.compact) - batch_state.any_compact = true; - else - batch_state.any_non_compact = true; - - if (config.updateable) - batch_state.any_updateable = true; - else - batch_state.any_non_updateable = true; - - if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { - batch_state.any_ploc = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { - batch_state.any_lbvh = true; - } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { - batch_state.any_update = true; - } else { - unreachable("Unknown internal_build_type"); - } - - if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { - /* The internal node count is updated in lbvh_build_internal for LBVH - * and from the PLOC shader for PLOC. 
*/ - struct radv_ir_header header = { - .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, - .max_bounds = {0x80000000, 0x80000000, 0x80000000}, - .dispatch_size_y = 1, - .dispatch_size_z = 1, - .sync_data = - { - .current_phase_end_counter = TASK_INDEX_INVALID, - /* Will be updated by the first PLOC shader invocation */ - .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, - }, - }; - - radv_update_buffer_cp(cmd_buffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, - &header, sizeof(header)); - } else { - /* Prepare ready counts for internal nodes */ - radv_fill_buffer(cmd_buffer, NULL, NULL, - pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, - bvh_states[i].scratch.update_size - bvh_states[i].scratch.internal_ready_count_offset, 0x0); - if (pInfos[i].srcAccelerationStructure != pInfos[i].dstAccelerationStructure) { - VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); - VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); - - VK_FROM_HANDLE(radv_buffer, src_as_buffer, src_as->buffer); - VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst_as->buffer); - - /* Copy header/metadata */ - radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src_as->offset, - dst_as_buffer->offset + dst_as->offset, bvh_states[i].accel_struct.bvh_offset); - } - } - } cmd_buffer->state.current_event_type = EventInternalUnknown; - if (batch_state.any_lbvh || batch_state.any_ploc) { - if (batch_state.any_non_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, false); - if (batch_state.any_updateable) - build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, true); + vk_cmd_build_acceleration_structures(commandBuffer, &device->vk, &device->meta_state.device, infoCount, pInfos, + ppBuildRangeInfos, &device->meta_state.accel_struct_build.build_args); - cmd_buffer->state.flush_bits |= flush_bits; - - morton_generate(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - morton_sort(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_lbvh) - lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); - - if (batch_state.any_ploc) - ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states); - - cmd_buffer->state.flush_bits |= flush_bits; - - if (batch_state.any_non_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, false); - - if (batch_state.any_compact) - encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, true); - - cmd_buffer->state.flush_bits |= flush_bits; - } - - init_header(commandBuffer, infoCount, pInfos, bvh_states, &batch_state); - - if (device->rra_trace.accel_structs) - init_geometry_infos(commandBuffer, infoCount, pInfos, bvh_states, ppBuildRangeInfos); - - if (batch_state.any_update) - update(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states); - - radv_describe_end_accel_struct_build(cmd_buffer); - - free(bvh_states); radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h index e6e432e4771..dbd7f961c38 100644 --- a/src/amd/vulkan/radv_device.h +++ b/src/amd/vulkan/radv_device.h @@ -24,7 +24,9 @@ #include "radv_rra.h" #include "radv_shader.h" +#include "vk_acceleration_structure.h" #include "vk_device.h" +#include "vk_meta.h" #include 
"vk_texcompress_astc.h" #include "vk_texcompress_etc2.h" @@ -302,17 +304,6 @@ struct radv_meta_state { } dcc_retile; struct { - VkPipelineLayout leaf_p_layout; - VkPipeline leaf_pipeline; - VkPipeline leaf_updateable_pipeline; - VkPipelineLayout morton_p_layout; - VkPipeline morton_pipeline; - VkPipelineLayout lbvh_main_p_layout; - VkPipeline lbvh_main_pipeline; - VkPipelineLayout lbvh_generate_ir_p_layout; - VkPipeline lbvh_generate_ir_pipeline; - VkPipelineLayout ploc_p_layout; - VkPipeline ploc_pipeline; VkPipelineLayout encode_p_layout; VkPipeline encode_pipeline; VkPipeline encode_compact_pipeline; @@ -324,6 +315,7 @@ struct radv_meta_state { VkPipeline copy_pipeline; struct radix_sort_vk *radix_sort; + struct vk_acceleration_structure_build_args build_args; struct { VkBuffer buffer; @@ -340,6 +332,8 @@ struct radv_meta_state { VkDescriptorSetLayout ds_layout; VkPipelineLayout p_layout; } dgc_prepare; + + struct vk_meta_device device; }; struct radv_memory_trace_data { diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c index 79e7802915d..c7adbf20e81 100644 --- a/src/amd/vulkan/radv_rra.c +++ b/src/amd/vulkan/radv_rra.c @@ -542,7 +542,7 @@ rra_transcode_triangle_node(struct rra_transcoding_context *ctx, const struct ra } static void -rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, radv_aabb bounds) +rra_transcode_aabb_node(struct rra_transcoding_context *ctx, const struct radv_bvh_aabb_node *src, vk_aabb bounds) { struct rra_aabb_node *dst = (struct rra_aabb_node *)(ctx->dst + ctx->dst_leaf_offset); ctx->dst_leaf_offset += sizeof(struct rra_aabb_node); @@ -580,7 +580,7 @@ rra_transcode_instance_node(struct rra_transcoding_context *ctx, const struct ra } static uint32_t rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, - radv_aabb bounds); + vk_aabb bounds); static void rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_bvh_box16_node *src) @@ -597,7 +597,7 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_ continue; } - radv_aabb bounds = { + vk_aabb bounds = { .min = { _mesa_half_to_float(src->coords[i][0][0]), @@ -653,7 +653,7 @@ get_geometry_id(const void *node, uint32_t node_type) } static uint32_t -rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, radv_aabb bounds) +rra_transcode_node(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id, vk_aabb bounds) { uint32_t node_type = src_id & 7; uint32_t src_offset = (src_id & (~7u)) << 3; diff --git a/src/amd/vulkan/bvh/lbvh_generate_ir.comp b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp similarity index 58% rename from src/amd/vulkan/bvh/lbvh_generate_ir.comp rename to src/vulkan/runtime/bvh/lbvh_generate_ir.comp index 18821d13a79..818e568b4c1 100644 --- a/src/amd/vulkan/bvh/lbvh_generate_ir.comp +++ b/src/vulkan/runtime/bvh/lbvh_generate_ir.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -18,9 +35,9 @@ #extension GL_EXT_buffer_reference2 : require #extension GL_KHR_memory_scope_semantics : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -36,8 +53,8 @@ main(void) uint32_t idx = global_id; - uint32_t previous_id = RADV_BVH_INVALID_NODE; - radv_aabb previous_bounds; + uint32_t previous_id = VK_BVH_INVALID_NODE; + vk_aabb previous_bounds; previous_bounds.min = vec3(INFINITY); previous_bounds.max = vec3(-INFINITY); @@ -58,13 +75,13 @@ main(void) * parents, which is a requirement of the encoder. */ uint32_t dst_idx = - atomicAdd(DEREF(REF(radv_ir_header)(args.header)).ir_internal_node_count, 1); + atomicAdd(DEREF(REF(vk_ir_header)(args.header)).ir_internal_node_count, 1); - uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(radv_ir_box_node); - uint32_t current_id = pack_ir_node_id(current_offset, radv_ir_node_internal); + uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(vk_ir_box_node); + uint32_t current_id = pack_ir_node_id(current_offset, vk_ir_node_internal); - REF(radv_ir_box_node) node = REF(radv_ir_box_node)(OFFSET(args.bvh, current_offset)); - radv_aabb bounds = previous_bounds; + REF(vk_ir_box_node) node = REF(vk_ir_box_node)(OFFSET(args.bvh, current_offset)); + vk_aabb bounds = previous_bounds; lbvh_node_info info = DEREF(INDEX(lbvh_node_info, args.node_info, idx)); @@ -78,10 +95,10 @@ main(void) previous_child_index = 1; if (previous_child_index == -1) { - if (children[0] != RADV_BVH_INVALID_NODE) { + if (children[0] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[0]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } @@ -89,23 +106,23 @@ main(void) } /* Fetch the non-cached child */ - if (children[1 - previous_child_index] != RADV_BVH_INVALID_NODE) { + if (children[1 - previous_child_index] != VK_BVH_INVALID_NODE) { uint32_t child_offset = ir_id_to_offset(children[1 - previous_child_index]); - REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); - radv_aabb child_bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(OFFSET(args.bvh, child_offset)); + vk_aabb child_bounds = DEREF(child).aabb; bounds.min = min(bounds.min, child_bounds.min); bounds.max = max(bounds.max, child_bounds.max); } - radv_ir_box_node node_value; + vk_ir_box_node node_value; node_value.base.aabb = bounds; - node_value.bvh_offset = 
RADV_UNKNOWN_BVH_OFFSET; + node_value.bvh_offset = VK_UNKNOWN_BVH_OFFSET; node_value.children = children; DEREF(node) = node_value; - if (info.parent == RADV_BVH_INVALID_NODE) + if (info.parent == VK_BVH_INVALID_NODE) break; idx = info.parent & ~LBVH_RIGHT_CHILD_BIT; diff --git a/src/amd/vulkan/bvh/lbvh_main.comp b/src/vulkan/runtime/bvh/lbvh_main.comp similarity index 76% rename from src/amd/vulkan/bvh/lbvh_main.comp rename to src/vulkan/runtime/bvh/lbvh_main.comp index c6c51280985..c79a3164eb9 100644 --- a/src/amd/vulkan/bvh/lbvh_main.comp +++ b/src/vulkan/runtime/bvh/lbvh_main.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; TYPE(lbvh_node_info, 4); @@ -74,11 +91,11 @@ main() { if (args.id_count <= 1) { REF(lbvh_node_info) dst = REF(lbvh_node_info)(args.node_info); - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; DEREF(dst).path_count = 2; DEREF(dst).children[0] = - args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : RADV_BVH_INVALID_NODE; - DEREF(dst).children[1] = RADV_BVH_INVALID_NODE; + args.id_count == 1 ? 
DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : VK_BVH_INVALID_NODE; + DEREF(dst).children[1] = VK_BVH_INVALID_NODE; return; } @@ -136,5 +153,5 @@ main() DEREF(dst).children[0] = DEREF(INDEX(key_id_pair, args.src_ids, left)).id; DEREF(dst).children[1] = DEREF(INDEX(key_id_pair, args.src_ids, right)).id; if (id == 0) - DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).parent = VK_BVH_INVALID_NODE; } diff --git a/src/vulkan/runtime/bvh/leaf.comp b/src/vulkan/runtime/bvh/leaf.comp new file mode 100644 index 00000000000..85f0756204a --- /dev/null +++ b/src/vulkan/runtime/bvh/leaf.comp @@ -0,0 +1,250 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "vk_build_interface.h" + +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform CONSTS { + leaf_args args; +}; + +/* A GLSL-adapted copy of VkAccelerationStructureInstanceKHR. */ +struct AccelerationStructureInstance { + mat3x4 transform; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + uint64_t accelerationStructureReference; +}; +TYPE(AccelerationStructureInstance, 8); + +bool +build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id) +{ + bool is_valid = true; + triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id); + + triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride); + + /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any + * other vertex component is NaN, and the first is not, the behavior is undefined. 
If the vertex + * format does not have a NaN representation, then all triangles are considered active. + */ + if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + if (geom_data.transform != NULL) { + mat4 transform = mat4(1.0); + + for (uint32_t col = 0; col < 4; col++) + for (uint32_t row = 0; row < 3; row++) + transform[col][row] = DEREF(INDEX(float, geom_data.transform, col + row * 4)); + + for (uint32_t i = 0; i < 3; i++) + vertices.vertex[i] = transform * vertices.vertex[i]; + } + + REF(vk_ir_triangle_node) node = REF(vk_ir_triangle_node)(dst_ptr); + + bounds.min = vec3(INFINITY); + bounds.max = vec3(-INFINITY); + + for (uint32_t coord = 0; coord < 3; coord++) + for (uint32_t comp = 0; comp < 3; comp++) { + DEREF(node).coords[coord][comp] = vertices.vertex[coord][comp]; + bounds.min[comp] = min(bounds.min[comp], vertices.vertex[coord][comp]); + bounds.max[comp] = max(bounds.max[comp], vertices.vertex[coord][comp]); + } + + DEREF(node).base.aabb = bounds; + DEREF(node).triangle_id = global_id; + DEREF(node).geometry_id_and_flags = geom_data.geometry_id; + DEREF(node).id = 9; + + return is_valid; +} + +bool +build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id) +{ + bool is_valid = true; + REF(vk_ir_aabb_node) node = REF(vk_ir_aabb_node)(dst_ptr); + + for (uint32_t vec = 0; vec < 2; vec++) + for (uint32_t comp = 0; comp < 3; comp++) { + float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3)); + + if (vec == 0) + bounds.min[comp] = coord; + else + bounds.max[comp] = coord; + } + + /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is + * NaN, and the first is not, the behavior is undefined. + */ + if (isnan(bounds.min.x)) +#if ALWAYS_ACTIVE + is_valid = false; +#else + return false; +#endif + + DEREF(node).base.aabb = bounds; + DEREF(node).primitive_id = global_id; + DEREF(node).geometry_id_and_flags = geometry_id; + + return is_valid; +} + +vk_aabb +calculate_instance_node_bounds(uint64_t base_ptr, mat3x4 otw_matrix) +{ + vk_aabb aabb; + + vk_aabb blas_aabb = DEREF(REF(vk_aabb)(base_ptr + BVH_BOUNDS_OFFSET)); + + for (uint32_t comp = 0; comp < 3; ++comp) { + aabb.min[comp] = otw_matrix[comp][3]; + aabb.max[comp] = otw_matrix[comp][3]; + for (uint32_t col = 0; col < 3; ++col) { + aabb.min[comp] += + min(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + aabb.max[comp] += + max(otw_matrix[comp][col] * blas_aabb.min[col], otw_matrix[comp][col] * blas_aabb.max[col]); + } + } + return aabb; +} + +bool +build_instance(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t global_id) +{ + REF(vk_ir_instance_node) node = REF(vk_ir_instance_node)(dst_ptr); + + AccelerationStructureInstance instance = DEREF(REF(AccelerationStructureInstance)(src_ptr)); + + /* An inactive instance is one whose acceleration structure handle is VK_NULL_HANDLE. Since the active terminology is + * only relevant for BVH updates, which we do not implement, we can also skip instances with mask == 0. 
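
calculate_instance_node_bounds above derives the instance AABB without transforming all eight box corners: for every output component it starts from the matrix translation and adds, per column, whichever of (column * blas_min) and (column * blas_max) is smaller for the minimum and larger for the maximum. A host-side C sketch of the same accumulation, assuming a row-indexed 3x4 affine matrix the way the shader indexes otw_matrix (the aabb struct and transform_aabb are illustrative names, not part of the patch):

#include <stdint.h>

struct aabb { float min[3], max[3]; };

/* Transform an AABB by an affine 3x4 matrix (first index = output row,
 * translation in column 3). Per component, accumulate the min/max of the
 * column scaled by the box extremes -- the same accumulation
 * calculate_instance_node_bounds performs on the BLAS bounds. */
static struct aabb
transform_aabb(const float otw[3][4], struct aabb in)
{
   struct aabb out;
   for (int comp = 0; comp < 3; ++comp) {
      out.min[comp] = otw[comp][3];
      out.max[comp] = otw[comp][3];
      for (int col = 0; col < 3; ++col) {
         float a = otw[comp][col] * in.min[col];
         float b = otw[comp][col] * in.max[col];
         out.min[comp] += a < b ? a : b;
         out.max[comp] += a < b ? b : a;
      }
   }
   return out;
}

As for the mask check that follows: custom_instance_and_mask mirrors VkAccelerationStructureInstanceKHR, with the 24-bit custom index in the low bits and the 8-bit mask in the top bits, which is why a mask of zero can be detected as custom_instance_and_mask < (1u << 24u).
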
+ */ + if (instance.accelerationStructureReference == 0 || instance.custom_instance_and_mask < (1u << 24u)) + return false; + + DEREF(node).base_ptr = instance.accelerationStructureReference; + + mat4 transform = mat4(instance.transform); + DEREF(node).otw_matrix = mat3x4(transform); + + bounds = calculate_instance_node_bounds(instance.accelerationStructureReference, mat3x4(transform)); + + DEREF(node).base.aabb = bounds; + DEREF(node).custom_instance_and_mask = instance.custom_instance_and_mask; + DEREF(node).sbt_offset_and_flags = instance.sbt_offset_and_flags; + DEREF(node).instance_id = global_id; + + return true; +} + +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + uint32_t primitive_id = args.geom_data.first_id + global_id; + + REF(key_id_pair) id_ptr = INDEX(key_id_pair, args.ids, primitive_id); + uint32_t src_offset = global_id * args.geom_data.stride; + + uint32_t dst_stride; + uint32_t node_type; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + dst_stride = SIZEOF(vk_ir_triangle_node); + node_type = vk_ir_node_triangle; + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + dst_stride = SIZEOF(vk_ir_aabb_node); + node_type = vk_ir_node_aabb; + } else { + dst_stride = SIZEOF(vk_ir_instance_node); + node_type = vk_ir_node_instance; + } + + uint32_t dst_offset = primitive_id * dst_stride; + VOID_REF dst_ptr = OFFSET(args.bvh, dst_offset); + + vk_aabb bounds; + bool is_active; + if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) { + is_active = build_triangle(bounds, dst_ptr, args.geom_data, global_id); + } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, global_id); + } else { + VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset); + /* arrayOfPointers */ + if (args.geom_data.stride == 8) { + src_ptr = DEREF(REF(VOID_REF)(src_ptr)); + } + + is_active = build_instance(bounds, src_ptr, dst_ptr, global_id); + } + +#if ALWAYS_ACTIVE + if (!is_active && args.geom_data.geometry_type != VK_GEOMETRY_TYPE_INSTANCES_KHR) { + bounds.min = vec3(0.0); + bounds.max = vec3(0.0); + is_active = true; + } +#endif + + DEREF(id_ptr).id = is_active ? 
pack_ir_node_id(dst_offset, node_type) : VK_BVH_INVALID_NODE; + + uvec4 ballot = subgroupBallot(is_active); + if (subgroupElect()) + atomicAdd(DEREF(args.header).active_leaf_count, subgroupBallotBitCount(ballot)); + + atomicMin(DEREF(args.header).min_bounds[0], to_emulated_float(bounds.min.x)); + atomicMin(DEREF(args.header).min_bounds[1], to_emulated_float(bounds.min.y)); + atomicMin(DEREF(args.header).min_bounds[2], to_emulated_float(bounds.min.z)); + atomicMax(DEREF(args.header).max_bounds[0], to_emulated_float(bounds.max.x)); + atomicMax(DEREF(args.header).max_bounds[1], to_emulated_float(bounds.max.y)); + atomicMax(DEREF(args.header).max_bounds[2], to_emulated_float(bounds.max.z)); +} diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build new file mode 100644 index 00000000000..a2d751c295c --- /dev/null +++ b/src/vulkan/runtime/bvh/meson.build @@ -0,0 +1,81 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# source file, output name, defines +bvh_shaders = [ + [ + 'lbvh_generate_ir.comp', + 'lbvh_generate_ir', + [], + ], + [ + 'lbvh_main.comp', + 'lbvh_main', + [], + ], + [ + 'leaf.comp', + 'leaf', + ['ALWAYS_ACTIVE=0'], + ], + [ + 'leaf.comp', + 'leaf_always_active', + ['ALWAYS_ACTIVE=1'], + ], + [ + 'morton.comp', + 'morton', + [], + ], + [ + 'ploc_internal.comp', + 'ploc_internal', + [], + ], +] + +vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh' + +vk_bvh_includes = files( + 'vk_build_helpers.h', + 'vk_build_interface.h', + 'vk_bvh.h', +) + +bvh_spv = [] +foreach s : bvh_shaders + command = [ + prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + (with_mesa_debug ? 
['-g'] : []) + command += glslang_quiet + + foreach define : s[2] + command += '-D' + define + endforeach + + bvh_spv += custom_target( + s[1] + '.spv.h', + input : s[0], + output : s[1] + '.spv.h', + command : command, + depend_files: vk_bvh_includes + ) +endforeach diff --git a/src/amd/vulkan/bvh/morton.comp b/src/vulkan/runtime/bvh/morton.comp similarity index 62% rename from src/amd/vulkan/bvh/morton.comp rename to src/vulkan/runtime/bvh/morton.comp index f795297a11c..75a6f15baf3 100644 --- a/src/amd/vulkan/bvh/morton.comp +++ b/src/vulkan/runtime/bvh/morton.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Konstantin Seurer * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ #version 460 @@ -17,9 +34,9 @@ #extension GL_EXT_buffer_reference : require #extension GL_EXT_buffer_reference2 : require -layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +#include "vk_build_interface.h" -#include "build_interface.h" +layout(local_size_x_id = SUBGROUP_SIZE_ID, local_size_y = 1, local_size_z = 1) in; layout(push_constant) uniform CONSTS { morton_args args; @@ -56,11 +73,11 @@ main(void) uint32_t id = DEREF(key_id).id; uint32_t key; - if (id != RADV_BVH_INVALID_NODE) { - radv_aabb bounds = DEREF(REF(radv_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; + if (id != VK_BVH_INVALID_NODE) { + vk_aabb bounds = DEREF(REF(vk_ir_node)OFFSET(args.bvh, ir_id_to_offset(id))).aabb; vec3 center = (bounds.min + bounds.max) * 0.5; - radv_aabb bvh_bounds; + vk_aabb bvh_bounds; bvh_bounds.min.x = from_emulated_float(DEREF(args.header).min_bounds[0]); bvh_bounds.min.y = from_emulated_float(DEREF(args.header).min_bounds[1]); bvh_bounds.min.z = from_emulated_float(DEREF(args.header).min_bounds[2]); diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/vulkan/runtime/bvh/ploc_internal.comp similarity index 76% rename from src/amd/vulkan/bvh/ploc_internal.comp rename to src/vulkan/runtime/bvh/ploc_internal.comp index 50fc40edc93..0ecf7d38d82 100644 --- a/src/amd/vulkan/bvh/ploc_internal.comp +++ b/src/vulkan/runtime/bvh/ploc_internal.comp @@ -1,7 +1,24 @@ /* * Copyright © 2022 Bas Nieuwenhuizen * - * SPDX-License-Identifier: MIT + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #version 460 @@ -24,7 +41,7 @@ layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in; #define USE_GLOBAL_SYNC -#include "build_interface.h" +#include "vk_build_interface.h" TYPE(ploc_prefix_scan_partition, 4); @@ -34,7 +51,8 @@ layout(push_constant) uniform CONSTS }; shared uint32_t exclusive_prefix_sum; -shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64]; +shared uint32_t aggregate_sums[PLOC_SUBGROUPS_PER_WORKGROUP]; +shared uint32_t aggregate_sums2[PLOC_SUBGROUPS_PER_WORKGROUP]; /* * Global prefix scan over all workgroups to find out the index of the collapsed node to write. 
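
The prefix_scan changes in the next hunk implement a chained scan across workgroups: each partition publishes an aggregate and, once everything before it is known, an inclusive sum; a later workgroup resolves its exclusive prefix by walking backwards, consuming inclusive sums where published and falling back to aggregates otherwise. A serial C sketch of that look-back, assuming inclusive_sum == 0xFFFFFFFF means "not yet published" as in the shader (exclusive_prefix is an illustrative name):

#include <stdint.h>

/* Mirrors ploc_prefix_scan_partition: one entry per workgroup-sized partition. */
struct partition {
   uint32_t aggregate;     /* count contributed by this partition alone */
   uint32_t inclusive_sum; /* aggregate + everything before it, or UINT32_MAX if unpublished */
};

/* Walk backwards from the partition preceding `index`, summing aggregates until a
 * published inclusive sum is found; the result is the exclusive prefix of `index`. */
static uint32_t
exclusive_prefix(const struct partition *parts, uint32_t index)
{
   uint32_t sum = 0;
   for (uint32_t p = index; p-- > 0;) {
      if (parts[p].inclusive_sum != UINT32_MAX)
         return sum + parts[p].inclusive_sum; /* everything before p is already folded in */
      sum += parts[p].aggregate;              /* otherwise keep looking further back */
   }
   return sum;
}

The shader performs the same walk with acquire/release atomics so that a partition's inclusive_sum only becomes visible after the data it covers does.
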
@@ -45,8 +63,7 @@ uint32_t prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index) { if (gl_LocalInvocationIndex == 0) { - /* Temporary copy of exclusive_prefix_sum to avoid reading+writing LDS each addition */ - uint32_t local_exclusive_prefix_sum = 0; + exclusive_prefix_sum = 0; if (task_index >= gl_WorkGroupSize.x) { REF(ploc_prefix_scan_partition) current_partition = REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x)); @@ -58,28 +75,55 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) { - local_exclusive_prefix_sum += DEREF(previous_partition).inclusive_sum; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum); break; } else { - local_exclusive_prefix_sum += DEREF(previous_partition).aggregate; + atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate); previous_partition -= 1; } } /* Set the inclusive sum for the next workgroups */ atomicStore(DEREF(current_partition).inclusive_sum, - DEREF(current_partition).aggregate + local_exclusive_prefix_sum, gl_ScopeDevice, + DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable); } - exclusive_prefix_sum = local_exclusive_prefix_sum; } if (subgroupElect()) aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot); barrier(); - if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) { - aggregate_sums[gl_LocalInvocationID.x] = - exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + if (PLOC_SUBGROUPS_PER_WORKGROUP <= SUBGROUP_SIZE) { + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + aggregate_sums[gl_LocalInvocationID.x] = + exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]); + } + } else { + /* If the length of aggregate_sums[] is larger than SUBGROUP_SIZE, + * the prefix scan can't be done simply by subgroupExclusiveAdd. 
+ */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] = aggregate_sums[gl_LocalInvocationID.x]; + barrier(); + + /* Hillis Steele inclusive scan on aggregate_sums2 */ + for (uint32_t stride = 1; stride < PLOC_SUBGROUPS_PER_WORKGROUP; stride *= 2) { + uint32_t value = 0; + if (gl_LocalInvocationID.x >= stride && gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + value = aggregate_sums2[gl_LocalInvocationID.x - stride]; + barrier(); + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) + aggregate_sums2[gl_LocalInvocationID.x] += value; + barrier(); + } + + /* Adapt to exclusive and add the prefix_sum from previous workgroups */ + if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) { + if (gl_LocalInvocationID.x == 0) + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum; + else + aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum + aggregate_sums2[gl_LocalInvocationID.x - 1]; + } } barrier(); @@ -90,20 +134,20 @@ prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t t #define BVH_LEVEL_COST 0.2 uint32_t -push_node(uint32_t children[2], radv_aabb bounds[2]) +push_node(uint32_t children[2], vk_aabb bounds[2]) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + uint32_t dst_id = pack_ir_node_id(dst_offset, vk_ir_node_internal); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); for (uint i = 0; i < 2; ++i) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i])); - REF(radv_ir_node) child = REF(radv_ir_node)(node); + REF(vk_ir_node) child = REF(vk_ir_node)(node); total_bounds.min = min(total_bounds.min, bounds[i].min); total_bounds.max = max(total_bounds.max, bounds[i].max); @@ -112,7 +156,7 @@ push_node(uint32_t children[2], radv_aabb bounds[2]) } DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; return dst_id; } @@ -136,7 +180,7 @@ decode_neighbour_offset(uint32_t encoded_offset) #define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD -shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; +shared vk_aabb shared_bounds[NUM_PLOC_LDS_ITEMS]; shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS]; uint32_t @@ -155,11 +199,11 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound; i += gl_WorkGroupSize.x) { uint32_t id = load_id(ids, iter, i); - if (id == RADV_BVH_INVALID_NODE) + if (id == VK_BVH_INVALID_NODE) continue; VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id)); - REF(radv_ir_node) node = REF(radv_ir_node)(addr); + REF(vk_ir_node) node = REF(vk_ir_node)(addr); shared_bounds[i - lds_base] = DEREF(node).aabb; } @@ -168,7 +212,7 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base, float combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j) { - radv_aabb 
combined_bounds; + vk_aabb combined_bounds; combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min); combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max); return aabb_surface_area(combined_bounds); @@ -187,10 +231,10 @@ main(void) if (DEREF(args.header).active_leaf_count <= 2) { if (gl_GlobalInvocationID.x == 0) { uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1); - uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node); - REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset)); + uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node); + REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset)); - radv_aabb total_bounds; + vk_aabb total_bounds; total_bounds.min = vec3(INFINITY); total_bounds.max = vec3(-INFINITY); @@ -198,10 +242,10 @@ main(void) for (; i < DEREF(args.header).active_leaf_count; i++) { uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id; - if (child_id != RADV_BVH_INVALID_NODE) { + if (child_id != VK_BVH_INVALID_NODE) { VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id)); - REF(radv_ir_node) child = REF(radv_ir_node)(node); - radv_aabb bounds = DEREF(child).aabb; + REF(vk_ir_node) child = REF(vk_ir_node)(node); + vk_aabb bounds = DEREF(child).aabb; total_bounds.min = min(total_bounds.min, bounds.min); total_bounds.max = max(total_bounds.max, bounds.max); @@ -210,10 +254,10 @@ main(void) DEREF(dst_node).children[i] = child_id; } for (; i < 2; i++) - DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE; + DEREF(dst_node).children[i] = VK_BVH_INVALID_NODE; DEREF(dst_node).base.aabb = total_bounds; - DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET; + DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET; } return; } @@ -329,11 +373,11 @@ main(void) if (task_index < neighbour_index) { uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index); uint32_t children[2] = {id, neighbour_id}; - radv_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; + vk_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]}; DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds); DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) = - RADV_BVH_INVALID_NODE; + VK_BVH_INVALID_NODE; } else { /* We still store in the other case so we don't destroy the node id needed to * create the internal node */ @@ -381,14 +425,14 @@ main(void) uint32_t id = task_index < current_task_count ? 
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) - : RADV_BVH_INVALID_NODE; - uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE); + : VK_BVH_INVALID_NODE; + uvec4 ballot = subgroupBallot(id != VK_BVH_INVALID_NODE); uint32_t new_offset = prefix_scan(ballot, partitions, task_index); if (task_index >= current_task_count) continue; - if (id != RADV_BVH_INVALID_NODE) { + if (id != VK_BVH_INVALID_NODE) { DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id; ++new_offset; } diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h new file mode 100644 index 00000000000..0a178adea14 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_helpers.h @@ -0,0 +1,522 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
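
The tail of the PLOC main loop above compacts the surviving node ids each iteration: a ballot over id != VK_BVH_INVALID_NODE feeds prefix_scan, and every valid id is rewritten to its densely packed offset so the next iteration works on a contiguous list. A serial C sketch of that compaction step (compact_ids and INVALID_ID are illustrative stand-ins):

#include <stdint.h>

#define INVALID_ID 0xFFFFFFFFu /* stand-in for VK_BVH_INVALID_NODE */

/* Copy all valid ids from src to dst, preserving order. The return value is the
 * surviving id count, i.e. the task count of the next PLOC iteration. The offsets
 * assigned here are what the ballot + prefix scan compute in parallel. */
static uint32_t
compact_ids(const uint32_t *src, uint32_t *dst, uint32_t count)
{
   uint32_t new_offset = 0;
   for (uint32_t i = 0; i < count; ++i) {
      if (src[i] != INVALID_ID)
         dst[new_offset++] = src[i];
   }
   return new_offset;
}
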
+ */ + +#ifndef VK_BVH_BUILD_HELPERS_H +#define VK_BVH_BUILD_HELPERS_H + +#include "vk_bvh.h" + +#define VK_FORMAT_UNDEFINED 0 +#define VK_FORMAT_R4G4_UNORM_PACK8 1 +#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2 +#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3 +#define VK_FORMAT_R5G6B5_UNORM_PACK16 4 +#define VK_FORMAT_B5G6R5_UNORM_PACK16 5 +#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6 +#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7 +#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8 +#define VK_FORMAT_R8_UNORM 9 +#define VK_FORMAT_R8_SNORM 10 +#define VK_FORMAT_R8_USCALED 11 +#define VK_FORMAT_R8_SSCALED 12 +#define VK_FORMAT_R8_UINT 13 +#define VK_FORMAT_R8_SINT 14 +#define VK_FORMAT_R8_SRGB 15 +#define VK_FORMAT_R8G8_UNORM 16 +#define VK_FORMAT_R8G8_SNORM 17 +#define VK_FORMAT_R8G8_USCALED 18 +#define VK_FORMAT_R8G8_SSCALED 19 +#define VK_FORMAT_R8G8_UINT 20 +#define VK_FORMAT_R8G8_SINT 21 +#define VK_FORMAT_R8G8_SRGB 22 +#define VK_FORMAT_R8G8B8_UNORM 23 +#define VK_FORMAT_R8G8B8_SNORM 24 +#define VK_FORMAT_R8G8B8_USCALED 25 +#define VK_FORMAT_R8G8B8_SSCALED 26 +#define VK_FORMAT_R8G8B8_UINT 27 +#define VK_FORMAT_R8G8B8_SINT 28 +#define VK_FORMAT_R8G8B8_SRGB 29 +#define VK_FORMAT_B8G8R8_UNORM 30 +#define VK_FORMAT_B8G8R8_SNORM 31 +#define VK_FORMAT_B8G8R8_USCALED 32 +#define VK_FORMAT_B8G8R8_SSCALED 33 +#define VK_FORMAT_B8G8R8_UINT 34 +#define VK_FORMAT_B8G8R8_SINT 35 +#define VK_FORMAT_B8G8R8_SRGB 36 +#define VK_FORMAT_R8G8B8A8_UNORM 37 +#define VK_FORMAT_R8G8B8A8_SNORM 38 +#define VK_FORMAT_R8G8B8A8_USCALED 39 +#define VK_FORMAT_R8G8B8A8_SSCALED 40 +#define VK_FORMAT_R8G8B8A8_UINT 41 +#define VK_FORMAT_R8G8B8A8_SINT 42 +#define VK_FORMAT_R8G8B8A8_SRGB 43 +#define VK_FORMAT_B8G8R8A8_UNORM 44 +#define VK_FORMAT_B8G8R8A8_SNORM 45 +#define VK_FORMAT_B8G8R8A8_USCALED 46 +#define VK_FORMAT_B8G8R8A8_SSCALED 47 +#define VK_FORMAT_B8G8R8A8_UINT 48 +#define VK_FORMAT_B8G8R8A8_SINT 49 +#define VK_FORMAT_B8G8R8A8_SRGB 50 +#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51 +#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52 +#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53 +#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54 +#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55 +#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56 +#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57 +#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58 +#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59 +#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60 +#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61 +#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62 +#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63 +#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64 +#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65 +#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66 +#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67 +#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68 +#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69 +#define VK_FORMAT_R16_UNORM 70 +#define VK_FORMAT_R16_SNORM 71 +#define VK_FORMAT_R16_USCALED 72 +#define VK_FORMAT_R16_SSCALED 73 +#define VK_FORMAT_R16_UINT 74 +#define VK_FORMAT_R16_SINT 75 +#define VK_FORMAT_R16_SFLOAT 76 +#define VK_FORMAT_R16G16_UNORM 77 +#define VK_FORMAT_R16G16_SNORM 78 +#define VK_FORMAT_R16G16_USCALED 79 +#define VK_FORMAT_R16G16_SSCALED 80 +#define VK_FORMAT_R16G16_UINT 81 +#define VK_FORMAT_R16G16_SINT 82 +#define VK_FORMAT_R16G16_SFLOAT 83 +#define VK_FORMAT_R16G16B16_UNORM 84 +#define VK_FORMAT_R16G16B16_SNORM 85 +#define VK_FORMAT_R16G16B16_USCALED 86 +#define VK_FORMAT_R16G16B16_SSCALED 87 +#define VK_FORMAT_R16G16B16_UINT 88 +#define VK_FORMAT_R16G16B16_SINT 89 +#define VK_FORMAT_R16G16B16_SFLOAT 90 +#define 
VK_FORMAT_R16G16B16A16_UNORM 91 +#define VK_FORMAT_R16G16B16A16_SNORM 92 +#define VK_FORMAT_R16G16B16A16_USCALED 93 +#define VK_FORMAT_R16G16B16A16_SSCALED 94 +#define VK_FORMAT_R16G16B16A16_UINT 95 +#define VK_FORMAT_R16G16B16A16_SINT 96 +#define VK_FORMAT_R16G16B16A16_SFLOAT 97 +#define VK_FORMAT_R32_UINT 98 +#define VK_FORMAT_R32_SINT 99 +#define VK_FORMAT_R32_SFLOAT 100 +#define VK_FORMAT_R32G32_UINT 101 +#define VK_FORMAT_R32G32_SINT 102 +#define VK_FORMAT_R32G32_SFLOAT 103 +#define VK_FORMAT_R32G32B32_UINT 104 +#define VK_FORMAT_R32G32B32_SINT 105 +#define VK_FORMAT_R32G32B32_SFLOAT 106 +#define VK_FORMAT_R32G32B32A32_UINT 107 +#define VK_FORMAT_R32G32B32A32_SINT 108 +#define VK_FORMAT_R32G32B32A32_SFLOAT 109 +#define VK_FORMAT_R64_UINT 110 +#define VK_FORMAT_R64_SINT 111 +#define VK_FORMAT_R64_SFLOAT 112 +#define VK_FORMAT_R64G64_UINT 113 +#define VK_FORMAT_R64G64_SINT 114 +#define VK_FORMAT_R64G64_SFLOAT 115 +#define VK_FORMAT_R64G64B64_UINT 116 +#define VK_FORMAT_R64G64B64_SINT 117 +#define VK_FORMAT_R64G64B64_SFLOAT 118 +#define VK_FORMAT_R64G64B64A64_UINT 119 +#define VK_FORMAT_R64G64B64A64_SINT 120 +#define VK_FORMAT_R64G64B64A64_SFLOAT 121 + +#define VK_INDEX_TYPE_UINT16 0 +#define VK_INDEX_TYPE_UINT32 1 +#define VK_INDEX_TYPE_NONE_KHR 1000165000 +#define VK_INDEX_TYPE_UINT8_EXT 1000265000 + +#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0 +#define VK_GEOMETRY_TYPE_AABBS_KHR 1 +#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2 + +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1 +#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2 +#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4 +#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8 + +#define TYPE(type, align) \ + layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref \ + { \ + type value; \ + }; + +#define REF(type) type##_ref +#define VOID_REF uint64_t +#define NULL 0 +#define DEREF(var) var.value + +#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1)) + +#define OFFSET(ptr, offset) (uint64_t(ptr) + offset) + +#define INFINITY (1.0 / 0.0) +#define NAN (0.0 / 0.0) + +#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type))) + +TYPE(int8_t, 1); +TYPE(uint8_t, 1); +TYPE(int16_t, 2); +TYPE(uint16_t, 2); +TYPE(int32_t, 4); +TYPE(uint32_t, 4); +TYPE(int64_t, 8); +TYPE(uint64_t, 8); + +TYPE(float, 4); + +TYPE(vec2, 4); +TYPE(vec3, 4); +TYPE(vec4, 4); + +TYPE(uvec4, 16); + +TYPE(VOID_REF, 8); + +/* copied from u_math.h */ +uint32_t +align(uint32_t value, uint32_t alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +int32_t +to_emulated_float(float f) +{ + int32_t bits = floatBitsToInt(f); + return f < 0 ? -2147483648 - bits : bits; +} + +float +from_emulated_float(int32_t bits) +{ + return intBitsToFloat(bits < 0 ? 
-2147483648 - bits : bits); +} + +TYPE(vk_aabb, 4); + +struct key_id_pair { + uint32_t id; + uint32_t key; +}; +TYPE(key_id_pair, 4); + +TYPE(vk_accel_struct_serialization_header, 8); + +TYPE(vk_ir_header, 4); +TYPE(vk_ir_node, 4); +TYPE(vk_ir_box_node, 4); +TYPE(vk_ir_triangle_node, 4); +TYPE(vk_ir_aabb_node, 4); +TYPE(vk_ir_instance_node, 8); + +TYPE(vk_global_sync_data, 4); + +uint32_t +ir_id_to_offset(uint32_t id) +{ + return id & (~3u); +} + +uint32_t +ir_id_to_type(uint32_t id) +{ + return id & 3u; +} + +uint32_t +pack_ir_node_id(uint32_t offset, uint32_t type) +{ + return offset | type; +} + +float +aabb_surface_area(vk_aabb aabb) +{ + vec3 diagonal = aabb.max - aabb.min; + return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z; +} + +/* Just a wrapper for 3 uints. */ +struct triangle_indices { + uint32_t index[3]; +}; + +triangle_indices +load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id) +{ + triangle_indices result; + + uint32_t index_base = global_id * 3; + + switch (index_format) { + case VK_INDEX_TYPE_UINT16: { + result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_UINT32: { + result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2)); + break; + } + case VK_INDEX_TYPE_NONE_KHR: { + result.index[0] = index_base + 0; + result.index[1] = index_base + 1; + result.index[2] = index_base + 2; + break; + } + case VK_INDEX_TYPE_UINT8_EXT: { + result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0)); + result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1)); + result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2)); + break; + } + } + + return result; +} + +/* Just a wrapper for 3 vec4s. 
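
to_emulated_float/from_emulated_float above define a monotone mapping between floats and signed integers (non-negative floats keep their IEEE bit pattern, negative floats are remapped so that more negative values give smaller integers), which is what allows leaf.comp to grow the scene bounds with plain integer atomicMin/atomicMax. A small C check of the round trip and of the ordering property, using the same formulas (the test values are arbitrary):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* C mirrors of the GLSL helpers above: bit-cast via memcpy instead of floatBitsToInt. */
static int32_t
to_emulated_float(float f)
{
   int32_t bits;
   memcpy(&bits, &f, sizeof(bits));
   return f < 0 ? INT32_MIN - bits : bits;
}

static float
from_emulated_float(int32_t bits)
{
   int32_t raw = bits < 0 ? INT32_MIN - bits : bits;
   float f;
   memcpy(&f, &raw, sizeof(f));
   return f;
}

int
main(void)
{
   const float v[] = {-5.0f, -1.0f, 0.0f, 0.5f, 3.0f}; /* sorted ascending */
   for (unsigned i = 0; i + 1 < sizeof(v) / sizeof(v[0]); ++i)
      assert(to_emulated_float(v[i]) <= to_emulated_float(v[i + 1])); /* order is preserved */
   assert(from_emulated_float(to_emulated_float(-1.0f)) == -1.0f);    /* mapping is invertible */
   return 0;
}
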
*/ +struct triangle_vertices { + vec4 vertex[3]; +}; + +TYPE(float16_t, 2); + +triangle_vertices +load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride) +{ + triangle_vertices result; + + for (uint32_t i = 0; i < 3; i++) { + VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride); + vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0); + + switch (vertex_format) { + case VK_FORMAT_R32G32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + break; + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + vertex.x = DEREF(INDEX(float, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + break; + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0)); + vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1)); + vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2)); + break; + case VK_FORMAT_R16G16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + break; + case VK_FORMAT_R16G16B16A16_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF)); + vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF)); + vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF)); + break; + case VK_FORMAT_R8G8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + break; + case VK_FORMAT_R8G8B8A8_SNORM: + vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F)); + vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F)); + vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F)); + break; + case VK_FORMAT_R16G16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + break; + case VK_FORMAT_R16G16B16A16_UNORM: + vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF); + vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF); + vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF); + break; + case VK_FORMAT_R8G8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + break; + case VK_FORMAT_R8G8B8A8_UNORM: + vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF); + vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF); + vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF); + break; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { + uint32_t data = DEREF(REF(uint32_t)(vertex_ptr)); + vertex.x = float(data & 0x3FF) / 0x3FF; + vertex.y = float((data >> 10) & 0x3FF) / 0x3FF; + vertex.z = float((data >> 20) & 0x3FF) / 0x3FF; + break; + } + } + + result.vertex[i] = vertex; + } + + return result; +} + +/** Compute ceiling of integer quotient of A divided by B. + From macros.h */ +#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B)) + +#ifdef USE_GLOBAL_SYNC + +/* There might be more invocations available than tasks to do. 
+ * In that case, the fetched task index is greater than the
+ * counter offset for the next phase. To avoid out-of-bounds
+ * accesses, phases will be skipped until the task index is
+ * in-bounds again. */
+uint32_t num_tasks_to_skip = 0;
+uint32_t phase_index = 0;
+bool should_skip = false;
+shared uint32_t global_task_index;
+
+shared uint32_t shared_phase_index;
+
+uint32_t
+task_count(REF(vk_ir_header) header)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   return DEREF(header).sync_data.task_counts[phase_index & 1];
+}
+
+/* Sets the task count for the next phase. */
+void
+set_next_task_count(REF(vk_ir_header) header, uint32_t new_count)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
+}
+
+/*
+ * This function has two main objectives:
+ * Firstly, it partitions pending work among free invocations.
+ * Secondly, it guarantees global synchronization between different phases.
+ *
+ * After every call to fetch_task, a new task index is returned.
+ * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
+ * to determine if the current phase should be executed or skipped.
+ *
+ * Since tasks are assigned per-workgroup, there is a possibility of the task index being
+ * greater than the total task count.
+ */
+uint32_t
+fetch_task(REF(vk_ir_header) header, bool did_work)
+{
+   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
+    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
+    * and their results are written to memory. */
+   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   if (gl_LocalInvocationIndex == 0) {
+      if (did_work)
+         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
+      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);
+
+      do {
+         /* Perform a memory barrier to refresh the current phase's end counter, in case
+          * another workgroup changed it. */
+         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+         /* The first invocation of the first workgroup in a new phase is responsible for initiating the
+          * switch to a new phase. It is only possible to switch to a new phase if all tasks of the
+          * previous phase have been completed. Switching to a new phase and incrementing the phase
+          * end counter in turn notifies all invocations for that phase that it is safe to execute.
+          */
+         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
+             DEREF(header).sync_data.task_done_counter == DEREF(header).sync_data.current_phase_end_counter) {
+            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
+               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
+               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+            } else {
+               atomicAdd(DEREF(header).sync_data.phase_index, 1);
+               DEREF(header).sync_data.current_phase_start_counter = DEREF(header).sync_data.current_phase_end_counter;
+               /* Ensure the changes to the phase index and start/end counter are visible for other
+                * workgroups waiting in the loop. 
*/ + memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + atomicAdd(DEREF(header).sync_data.current_phase_end_counter, + DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x)); + } + break; + } + + /* If other invocations have finished all nodes, break out; there is no work to do */ + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) { + break; + } + } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter); + + shared_phase_index = DEREF(header).sync_data.phase_index; + } + + barrier(); + if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) + return TASK_INDEX_INVALID; + + num_tasks_to_skip = shared_phase_index - phase_index; + + uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter; + return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x; +} + +bool +should_execute_phase() +{ + if (num_tasks_to_skip > 0) { + /* Skip to next phase. */ + ++phase_index; + --num_tasks_to_skip; + return false; + } + return true; +} + +#define PHASE(header) \ + for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true)) +#endif + +#endif diff --git a/src/vulkan/runtime/bvh/vk_build_interface.h b/src/vulkan/runtime/bvh/vk_build_interface.h new file mode 100644 index 00000000000..0d2f1fed21c --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_build_interface.h @@ -0,0 +1,103 @@ +/* + * Copyright © 2022 Konstantin Seurer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef VK_BVH_BUILD_INTERFACE_H +#define VK_BVH_BUILD_INTERFACE_H + +#ifdef VULKAN +#include "vk_build_helpers.h" +#else +#include +#include "vk_bvh.h" +#define REF(type) uint64_t +#define VOID_REF uint64_t +#endif + +#define SUBGROUP_SIZE_ID 0 +#define BVH_BOUNDS_OFFSET_ID 1 +#ifdef VULKAN +layout (constant_id = SUBGROUP_SIZE_ID) const int SUBGROUP_SIZE = 64; +layout (constant_id = BVH_BOUNDS_OFFSET_ID) const int BVH_BOUNDS_OFFSET = 0; +#endif + +struct leaf_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; + + vk_bvh_geometry_data geom_data; +}; + +struct morton_args { + VOID_REF bvh; + REF(vk_ir_header) header; + REF(key_id_pair) ids; +}; + +#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 +#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) + +struct lbvh_node_info { + /* Number of children that have been processed (or are invalid/leaves) in + * the lbvh_generate_ir pass. + */ + uint32_t path_count; + + uint32_t children[2]; + uint32_t parent; +}; + +struct lbvh_main_args { + VOID_REF bvh; + REF(key_id_pair) src_ids; + VOID_REF node_info; + uint32_t id_count; + uint32_t internal_node_base; +}; + +struct lbvh_generate_ir_args { + VOID_REF bvh; + VOID_REF node_info; + VOID_REF header; + uint32_t internal_node_base; +}; + +struct ploc_prefix_scan_partition { + uint32_t aggregate; + uint32_t inclusive_sum; +}; + +#define PLOC_WORKGROUP_SIZE 1024 +#define PLOC_SUBGROUPS_PER_WORKGROUP \ + (DIV_ROUND_UP(PLOC_WORKGROUP_SIZE, SUBGROUP_SIZE)) + +struct ploc_args { + VOID_REF bvh; + VOID_REF prefix_scan_partitions; + REF(vk_ir_header) header; + VOID_REF ids_0; + VOID_REF ids_1; + uint32_t internal_node_offset; +}; + +#endif diff --git a/src/vulkan/runtime/bvh/vk_bvh.h b/src/vulkan/runtime/bvh/vk_bvh.h new file mode 100644 index 00000000000..f393fa443d4 --- /dev/null +++ b/src/vulkan/runtime/bvh/vk_bvh.h @@ -0,0 +1,156 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BVH_VK_BVH_H +#define BVH_VK_BVH_H + +#define vk_ir_node_triangle 0 +#define vk_ir_node_internal 1 +#define vk_ir_node_instance 2 +#define vk_ir_node_aabb 3 + +#define VK_GEOMETRY_OPAQUE (1u << 31) + +#ifdef VULKAN +#define VK_UUID_SIZE 16 +#else +#include +typedef struct vk_ir_node vk_ir_node; +typedef struct vk_global_sync_data vk_global_sync_data; +typedef struct vk_bvh_geometry_data vk_bvh_geometry_data; + +typedef struct { + float values[3][4]; +} mat3x4; + +typedef struct { + float x; + float y; + float z; +} vec3; + +typedef struct vk_aabb vk_aabb; +#endif + +struct vk_aabb { + vec3 min; + vec3 max; +}; + +/* This is the header structure for serialized acceleration structures, as + * defined by the Vulkan spec. + */ +struct vk_accel_struct_serialization_header { + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t accel_struct_compat[VK_UUID_SIZE]; + uint64_t serialization_size; + uint64_t deserialization_size; + uint64_t instance_count; +#ifndef VULKAN + uint64_t instances[]; +#endif +}; + +struct vk_global_sync_data { + uint32_t task_counts[2]; + uint32_t task_started_counter; + uint32_t task_done_counter; + uint32_t current_phase_start_counter; + uint32_t current_phase_end_counter; + uint32_t phase_index; + /* If this flag is set, the shader should exit + * instead of executing another phase */ + uint32_t next_phase_exit_flag; +}; + +struct vk_ir_header { + int32_t min_bounds[3]; + int32_t max_bounds[3]; + uint32_t active_leaf_count; + /* Indirect dispatch dimensions for the encoder. + * ir_internal_node_count is the thread count in the X dimension, + * while Y and Z are always set to 1. */ + uint32_t ir_internal_node_count; + uint32_t dispatch_size_y; + uint32_t dispatch_size_z; + vk_global_sync_data sync_data; + uint32_t dst_node_offset; +}; + +struct vk_ir_node { + vk_aabb aabb; +}; + +#define VK_UNKNOWN_BVH_OFFSET 0xFFFFFFFF +#define VK_NULL_BVH_OFFSET 0xFFFFFFFE + +struct vk_ir_box_node { + vk_ir_node base; + uint32_t children[2]; + uint32_t bvh_offset; +}; + +struct vk_ir_aabb_node { + vk_ir_node base; + uint32_t primitive_id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_triangle_node { + vk_ir_node base; + float coords[3][3]; + uint32_t triangle_id; + uint32_t id; + uint32_t geometry_id_and_flags; +}; + +struct vk_ir_instance_node { + vk_ir_node base; + /* See radv_bvh_instance_node */ + uint64_t base_ptr; + uint32_t custom_instance_and_mask; + uint32_t sbt_offset_and_flags; + mat3x4 otw_matrix; + uint32_t instance_id; +}; + +#define VK_BVH_INVALID_NODE 0xFFFFFFFF + +/* If the task index is set to this value, there is no + * more work to do. 
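Editor's sketch (not part of the patch): vk_global_sync_data above double-buffers task_counts[] so one slot holds the current phase's task count while the next phase's count is staged in the other slot, selected by the parity of phase_index; fetch_task then rebases a workgroup-granular task index against the phase start counter. A small C model of that indexing; the names and the workgroup size are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define WORKGROUP_SIZE 64u /* assumed; the real value is a shader build parameter */

struct sync_model {
   uint32_t task_counts[2];
   uint32_t phase_index;
   uint32_t current_phase_start_counter;
};

/* The current phase reads task_counts[phase_index & 1]... */
static uint32_t model_task_count(const struct sync_model *s)
{
   return s->task_counts[s->phase_index & 1];
}

/* ...while the next phase's count is staged in the other slot. */
static void model_set_next_task_count(struct sync_model *s, uint32_t count)
{
   s->task_counts[(s->phase_index + 1) & 1] = count;
}

/* Mirrors the tail of fetch_task: the workgroup's global task index is
 * rebased against the phase start counter and expanded per invocation. */
static uint32_t model_task_index(const struct sync_model *s,
                                 uint32_t global_task_index,
                                 uint32_t local_invocation)
{
   uint32_t local_task = global_task_index - s->current_phase_start_counter;
   return local_task * WORKGROUP_SIZE + local_invocation;
}

int main(void)
{
   struct sync_model s = { .task_counts = { 1000, 0 },
                           .phase_index = 0,
                           .current_phase_start_counter = 16 };

   model_set_next_task_count(&s, 500);                      /* staged for phase 1 */
   printf("phase 0 tasks: %u\n", model_task_count(&s));     /* 1000 */
   printf("task index: %u\n", model_task_index(&s, 18, 5)); /* (18 - 16) * 64 + 5 = 133 */
   return 0;
}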
*/ +#define TASK_INDEX_INVALID 0xFFFFFFFF + +struct vk_bvh_geometry_data { + uint64_t data; + uint64_t indices; + uint64_t transform; + + uint32_t geometry_id; + uint32_t geometry_type; + uint32_t first_id; + uint32_t stride; + uint32_t vertex_format; + uint32_t index_format; +}; + +#endif diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build index b325ebe6f3d..9d34ae432f0 100644 --- a/src/vulkan/runtime/meson.build +++ b/src/vulkan/runtime/meson.build @@ -7,7 +7,6 @@ vulkan_lite_runtime_files = files( 'rmv/vk_rmv_common.c', 'rmv/vk_rmv_exporter.c', - 'vk_acceleration_structure.c', 'vk_blend.c', 'vk_buffer.c', 'vk_buffer_view.c', @@ -277,6 +276,8 @@ vulkan_runtime_deps = [ ] if prog_glslang.found() + subdir('radix_sort') + subdir('bvh') vulkan_runtime_files += files('vk_texcompress_astc.c') vulkan_runtime_files += custom_target( 'astc_spv.h', @@ -288,6 +289,10 @@ if prog_glslang.found() ], depfile : 'astc_spv.h.d', ) + vulkan_runtime_files += files('vk_acceleration_structure.c') + vulkan_runtime_files += radix_sort_files + vulkan_runtime_files += bvh_spv + vulkan_runtime_files += radix_sort_spv endif libvulkan_runtime = static_library( @@ -320,7 +325,10 @@ else ) endif -idep_vulkan_runtime_headers = idep_vulkan_lite_runtime_headers +idep_vulkan_runtime_headers = [idep_vulkan_lite_runtime_headers] +idep_vulkan_runtime_headers += declare_dependency( + include_directories : include_directories('bvh'), +) idep_vulkan_runtime = declare_dependency( dependencies : [ diff --git a/src/amd/vulkan/radix_sort/LICENSE b/src/vulkan/runtime/radix_sort/LICENSE similarity index 100% rename from src/amd/vulkan/radix_sort/LICENSE rename to src/vulkan/runtime/radix_sort/LICENSE diff --git a/src/amd/vulkan/radix_sort/common/macros.h b/src/vulkan/runtime/radix_sort/common/macros.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/macros.h rename to src/vulkan/runtime/radix_sort/common/macros.h diff --git a/src/amd/vulkan/radix_sort/common/util.c b/src/vulkan/runtime/radix_sort/common/util.c similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.c rename to src/vulkan/runtime/radix_sort/common/util.c diff --git a/src/amd/vulkan/radix_sort/common/util.h b/src/vulkan/runtime/radix_sort/common/util.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/util.h rename to src/vulkan/runtime/radix_sort/common/util.h diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.c b/src/vulkan/runtime/radix_sort/common/vk/barrier.c similarity index 81% rename from src/amd/vulkan/radix_sort/common/vk/barrier.c rename to src/vulkan/runtime/radix_sort/common/vk/barrier.c index 58134dbd11a..e0865f6b770 100644 --- a/src/amd/vulkan/radix_sort/common/vk/barrier.c +++ b/src/vulkan/runtime/radix_sort/common/vk/barrier.c @@ -7,6 +7,8 @@ // #include "barrier.h" +#include "vulkan/runtime/vk_device.h" +#include "vulkan/runtime/vk_command_buffer.h" // // @@ -15,6 +17,10 @@ void vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -23,7 +29,7 @@ vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -42,6 +48,10 @@ 
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -50,7 +60,7 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -69,6 +79,10 @@ vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -77,7 +91,7 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -96,6 +110,10 @@ vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -104,7 +122,7 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -123,6 +141,10 @@ vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb) void vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -132,7 +154,7 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) VK_ACCESS_SHADER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, @@ -151,6 +173,10 @@ vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb) void vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -160,7 +186,7 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, @@ -179,6 +205,10 @@ vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb) void vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -187,7 +217,7 @@ 
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -206,6 +236,10 @@ vk_barrier_compute_w_to_host_r(VkCommandBuffer cb) void vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -214,7 +248,7 @@ vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb) .dstAccessMask = VK_ACCESS_HOST_READ_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, @@ -237,12 +271,16 @@ vk_memory_barrier(VkCommandBuffer cb, VkPipelineStageFlags dst_stage, VkAccessFlags dst_mask) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = src_mask, .dstAccessMask = dst_mask }; - vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); + disp->CmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); } // @@ -252,6 +290,10 @@ vk_memory_barrier(VkCommandBuffer cb, void vk_barrier_debug(VkCommandBuffer cb) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + static VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, @@ -288,7 +330,7 @@ vk_barrier_debug(VkCommandBuffer cb) VK_ACCESS_HOST_WRITE_BIT }; - vkCmdPipelineBarrier(cb, + disp->CmdPipelineBarrier(cb, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, diff --git a/src/amd/vulkan/radix_sort/common/vk/barrier.h b/src/vulkan/runtime/radix_sort/common/vk/barrier.h similarity index 100% rename from src/amd/vulkan/radix_sort/common/vk/barrier.h rename to src/vulkan/runtime/radix_sort/common/vk/barrier.h diff --git a/src/vulkan/runtime/radix_sort/meson.build b/src/vulkan/runtime/radix_sort/meson.build new file mode 100644 index 00000000000..138c0c9369a --- /dev/null +++ b/src/vulkan/runtime/radix_sort/meson.build @@ -0,0 +1,37 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
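Editor's note: with the barrier helpers above now resolving the device dispatch table from the vk_command_buffer behind the handle, any driver built on the common runtime can record them. A hedged usage sketch follows; it is illustration only, the direct vkCmdDispatch calls are placeholders, and real runtime code would route its dispatches through the dispatch table just as the rest of this patch does.

#include <vulkan/vulkan.h>
#include "common/vk/barrier.h"

/* Record two dependent compute passes with a write -> read hazard in between.
 * 'cmd_buffer' must belong to a driver whose command buffers derive from
 * vk_command_buffer, since the helper looks up the dispatch table from it. */
static void record_two_pass_dispatch(VkCommandBuffer cmd_buffer, uint32_t groups_x)
{
   vkCmdDispatch(cmd_buffer, groups_x, 1, 1);
   vk_barrier_compute_w_to_compute_r(cmd_buffer);
   vkCmdDispatch(cmd_buffer, groups_x, 1, 1);
}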
+ +subdir('shaders') + +radix_sort_files = files( + 'common/vk/barrier.c', + 'common/vk/barrier.h', + 'common/macros.h', + 'common/util.c', + 'common/util.h', + 'shaders/push.h', + 'radix_sort_u64.c', + 'radix_sort_u64.h', + 'radix_sort_vk_devaddr.h', + 'radix_sort_vk_ext.h', + 'radix_sort_vk.c', + 'radix_sort_vk.h', + 'target.h' +) diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.c b/src/vulkan/runtime/radix_sort/radix_sort_u64.c new file mode 100644 index 00000000000..0d5f9217656 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.c @@ -0,0 +1,59 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "radix_sort_u64.h" +#include + +static const uint32_t init_spv[] = { +#include "radix_sort/shaders/init.comp.spv.h" +}; + +static const uint32_t fill_spv[] = { +#include "radix_sort/shaders/fill.comp.spv.h" +}; + +static const uint32_t histogram_spv[] = { +#include "radix_sort/shaders/histogram.comp.spv.h" +}; + +static const uint32_t prefix_spv[] = { +#include "radix_sort/shaders/prefix.comp.spv.h" +}; + +static const uint32_t scatter_0_even_spv[] = { +#include "radix_sort/shaders/scatter_0_even.comp.spv.h" +}; + +static const uint32_t scatter_0_odd_spv[] = { +#include "radix_sort/shaders/scatter_0_odd.comp.spv.h" +}; + +static const uint32_t scatter_1_even_spv[] = { +#include "radix_sort/shaders/scatter_1_even.comp.spv.h" +}; + +static const uint32_t scatter_1_odd_spv[] = { +#include "radix_sort/shaders/scatter_1_odd.comp.spv.h" +}; + + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config) +{ + assert(config.keyval_dwords == 2); + + const uint32_t *spv[8] = { + init_spv, fill_spv, histogram_spv, prefix_spv, + scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv, + }; + const uint32_t spv_sizes[8] = { + sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv), sizeof(prefix_spv), + sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv), sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv), + }; + return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, config); +} + diff --git a/src/vulkan/runtime/radix_sort/radix_sort_u64.h b/src/vulkan/runtime/radix_sort/radix_sort_u64.h new file mode 100644 index 00000000000..8bb37fe2082 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/radix_sort_u64.h @@ -0,0 +1,24 @@ +/* + * Copyright © 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef VK_RADIX_SORT_U64 +#define VK_RADIX_SORT_U64 + +#include "radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif + +radix_sort_vk_t * +vk_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, + VkPipelineCache pc, + struct radix_sort_vk_target_config config); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.c b/src/vulkan/runtime/radix_sort/radix_sort_vk.c similarity index 83% rename from src/amd/vulkan/radix_sort/radix_sort_vk.c rename to src/vulkan/runtime/radix_sort/radix_sort_vk.c index 70253884fc4..31efd3d4a75 100644 --- a/src/amd/vulkan/radix_sort/radix_sort_vk.c +++ b/src/vulkan/runtime/radix_sort/radix_sort_vk.c @@ -11,6 +11,10 @@ #include "common/vk/barrier.h" #include "radix_sort_vk_devaddr.h" #include "shaders/push.h" +#include "shaders/config.h" + +#include "vk_command_buffer.h" +#include "vk_device.h" // // @@ -100,14 +104,41 @@ radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs, // NOTE: Assumes 
.histograms are before .partitions. // // Last scatter workgroup skips writing to a partition. + // Each RS_RADIX_LOG2 (8) bit pass has a zero-initialized histogram. This + // is one RS_RADIX_SIZE histogram per keyval byte. // - // One histogram per (keyval byte + partitions) + // The last scatter workgroup skips writing to a partition so it doesn't + // need to be allocated. // - uint32_t const partitions = scatter_blocks - 1; + // If the device doesn't support "sequential dispatch" of workgroups, then + // we need a zero-initialized dword counter per radix pass in the keyval + // to atomically acquire a virtual workgroup id. On sequentially + // dispatched devices, this is simply `gl_WorkGroupID.x`. + // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // The `.workgroup_ids[]` are located after the last partition. + // + VkDeviceSize const histo_size = RS_RADIX_SIZE * sizeof(uint32_t); - mr->internal_size = (mr->keyval_size + partitions) * (RS_RADIX_SIZE * sizeof(uint32_t)); + mr->internal_size = (mr->keyval_size + scatter_blocks - 1) * histo_size; mr->internal_alignment = internal_sg_size * sizeof(uint32_t); + // + // Support for nonsequential dispatch can be disabled. + // + VkDeviceSize const workgroup_ids_size = mr->keyval_size * sizeof(uint32_t); + + mr->internal_size += workgroup_ids_size; + // // Indirect // @@ -185,13 +216,17 @@ rs_pipeline_count(struct radix_sort_vk const * rs) } radix_sort_vk_t * -radix_sort_vk_create(VkDevice device, +radix_sort_vk_create(VkDevice _device, VkAllocationCallbacks const * ac, VkPipelineCache pc, const uint32_t* const* spv, const uint32_t* spv_sizes, struct radix_sort_vk_target_config config) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Allocate radix_sort_vk // @@ -244,6 +279,38 @@ radix_sort_vk_create(VkDevice device, .size = sizeof(struct rs_push_scatter) }, // scatter_1_odd }; + uint32_t spec_constants[] = { + [RS_FILL_WORKGROUP_SIZE] = 1u << config.fill.workgroup_size_log2, + [RS_FILL_BLOCK_ROWS] = config.fill.block_rows, + [RS_HISTOGRAM_WORKGROUP_SIZE] = 1u << config.histogram.workgroup_size_log2, + [RS_HISTOGRAM_SUBGROUP_SIZE_LOG2] = config.histogram.subgroup_size_log2, + [RS_HISTOGRAM_BLOCK_ROWS] = config.histogram.block_rows, + [RS_PREFIX_WORKGROUP_SIZE] = 1u << config.prefix.workgroup_size_log2, + [RS_PREFIX_SUBGROUP_SIZE_LOG2] = config.prefix.subgroup_size_log2, + [RS_SCATTER_WORKGROUP_SIZE] = 1u << config.scatter.workgroup_size_log2, + [RS_SCATTER_SUBGROUP_SIZE_LOG2] = config.scatter.subgroup_size_log2, + [RS_SCATTER_BLOCK_ROWS] = config.scatter.block_rows, + [RS_SCATTER_NONSEQUENTIAL_DISPATCH] = config.nonsequential_dispatch, + }; + + VkSpecializationMapEntry spec_map[ARRAY_LENGTH_MACRO(spec_constants)]; + + for (uint32_t ii = 0; ii < ARRAY_LENGTH_MACRO(spec_constants); ii++) + { + spec_map[ii] = (VkSpecializationMapEntry) { + .constantID = ii, + .offset = sizeof(uint32_t) * ii, + .size = sizeof(uint32_t), + }; + } + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_LENGTH_MACRO(spec_map), + 
.pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -259,7 +326,7 @@ radix_sort_vk_create(VkDevice device, { plci.pPushConstantRanges = pcr + ii; - if (vkCreatePipelineLayout(device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) + if (disp->CreatePipelineLayout(_device, &plci, NULL, rs->pipeline_layouts.handles + ii) != VK_SUCCESS) goto fail_layout; } @@ -282,7 +349,7 @@ radix_sort_vk_create(VkDevice device, smci.codeSize = spv_sizes[ii]; smci.pCode = spv[ii]; - if (vkCreateShaderModule(device, &smci, ac, sms + ii) != VK_SUCCESS) + if (disp->CreateShaderModule(_device, &smci, ac, sms + ii) != VK_SUCCESS) goto fail_shader; } @@ -323,11 +390,11 @@ radix_sort_vk_create(VkDevice device, .flags = 0, \ .stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, \ .pNext = NULL, \ - .flags = 0, \ + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, \ .stage = VK_SHADER_STAGE_COMPUTE_BIT, \ .module = sms[idx_], \ .pName = "main", \ - .pSpecializationInfo = NULL }, \ + .pSpecializationInfo = &spec_info }, \ \ .layout = rs->pipeline_layouts.handles[idx_], \ .basePipelineHandle = VK_NULL_HANDLE, \ @@ -358,7 +425,7 @@ radix_sort_vk_create(VkDevice device, // // Create the compute pipelines // - if (vkCreateComputePipelines(device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) + if (disp->CreateComputePipelines(_device, pc, pipeline_count, cpcis, ac, rs->pipelines.handles) != VK_SUCCESS) goto fail_pipeline; // @@ -366,7 +433,7 @@ radix_sort_vk_create(VkDevice device, // for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } #ifdef RS_VK_ENABLE_DEBUG_UTILS @@ -397,17 +464,17 @@ radix_sort_vk_create(VkDevice device, fail_pipeline: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(device, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(_device, rs->pipelines.handles[ii], ac); } fail_shader: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyShaderModule(device, sms[ii], ac); + disp->DestroyShaderModule(_device, sms[ii], ac); } fail_layout: for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(device, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(_device, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -420,18 +487,22 @@ fail_layout: void radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbacks const * const ac) { + VK_FROM_HANDLE(vk_device, device, d); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + uint32_t const pipeline_count = rs_pipeline_count(rs); // destroy pipelines for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipeline(d, rs->pipelines.handles[ii], ac); + disp->DestroyPipeline(d, rs->pipelines.handles[ii], ac); } // destroy pipeline layouts for (uint32_t ii = 0; ii < pipeline_count; ii++) { - vkDestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); + disp->DestroyPipelineLayout(d, rs->pipeline_layouts.handles[ii], ac); } free(rs); @@ -441,8 +512,12 @@ radix_sort_vk_destroy(struct radix_sort_vk * rs, VkDevice d, VkAllocationCallbac // // static VkDeviceAddress -rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) +rs_get_devaddr(VkDevice _device, VkDescriptorBufferInfo const * dbi) { + 
VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkBufferDeviceAddressInfo const bdai = { .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, @@ -450,7 +525,7 @@ rs_get_devaddr(VkDevice device, VkDescriptorBufferInfo const * dbi) .buffer = dbi->buffer }; - VkDeviceAddress const devaddr = vkGetBufferDeviceAddress(device, &bdai) + dbi->offset; + VkDeviceAddress const devaddr = disp->GetBufferDeviceAddress(_device, &bdai) + dbi->offset; return devaddr; } @@ -465,13 +540,17 @@ rs_ext_cmd_write_timestamp(struct radix_sort_vk_ext_timestamps * ext_timestamps, VkCommandBuffer cb, VkPipelineStageFlagBits pipeline_stage) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + if ((ext_timestamps != NULL) && (ext_timestamps->timestamps_set < ext_timestamps->timestamp_count)) { - vkCmdWriteTimestamp(cb, - pipeline_stage, - ext_timestamps->timestamps, - ext_timestamps->timestamps_set++); + disp->CmdWriteTimestamp(cb, + pipeline_stage, + ext_timestamps->timestamps, + ext_timestamps->timestamps_set++); } } @@ -497,10 +576,14 @@ struct radix_sort_vk_ext_base void radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? // @@ -557,16 +640,13 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -679,16 +759,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); - vkCmdDispatch(cb, histo_blocks, 1, 1); + disp->CmdDispatch(cb, histo_blocks, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -707,16 +787,16 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); //////////////////////////////////////////////////////////////////////// // @@ -746,14 +826,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + 
disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), &push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -762,7 +842,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, while (true) { - vkCmdDispatch(cb, scatter_blocks, 1, 1); + disp->CmdDispatch(cb, scatter_blocks, 1, 1); // // Continue? @@ -788,7 +868,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, OFFSETOF_MACRO(struct rs_push_scatter, devaddr_histograms), @@ -801,7 +881,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -812,10 +892,7 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -825,10 +902,14 @@ radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs, void radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs, radix_sort_vk_sort_indirect_devaddr_info_t const * info, - VkDevice device, + VkDevice _device, VkCommandBuffer cb, VkDeviceAddress * keyvals_sorted) { + VK_FROM_HANDLE(vk_device, device, _device); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + // // Anything to do? 
// @@ -886,16 +967,13 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // Label the command buffer // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdBeginDebugUtilsLabelEXT != NULL) - { - VkDebugUtilsLabelEXT const label = { - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, - .pNext = NULL, - .pLabelName = "radix_sort_vk_sort_indirect", - }; + VkDebugUtilsLabelEXT const label = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = NULL, + .pLabelName = "radix_sort_vk_sort_indirect", + }; - pfn_vkCmdBeginDebugUtilsLabelEXT(cb, &label); - } + disp->CmdBeginDebugUtilsLabelEXT(cb, &label); #endif // @@ -938,16 +1016,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.init, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_init), &push_init); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.init); - vkCmdDispatch(cb, 1, 1, 1); + disp->CmdDispatch(cb, 1, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -967,14 +1045,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0xFFFFFFFF }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_pad), &push_pad); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.pad)); } @@ -992,14 +1070,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .dword = 0 }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.fill, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_zero), &push_zero); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.fill); info->dispatch_indirect(cb, &info->indirect, offsetof(struct rs_indirect_info, dispatch.zero)); } @@ -1021,14 +1099,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .passes = passes }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); info->dispatch_indirect(cb, &info->indirect, @@ -1049,16 +1127,16 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * .devaddr_histograms = devaddr_histograms, }; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - vkCmdDispatch(cb, passes, 1, 1); + disp->CmdDispatch(cb, passes, 1, 1); } #ifdef RS_VK_ENABLE_EXTENSIONS @@ -1088,14 +1166,14 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * { uint32_t const pass_dword = pass_idx / 4; - vkCmdPushConstants(cb, + disp->CmdPushConstants(cb, rs->pipeline_layouts.named.scatter[pass_dword].even, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), 
&push_scatter); - vkCmdBindPipeline(cb, + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.scatter[pass_dword].even); } @@ -1134,7 +1212,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipelineLayout const pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // : rs->pipeline_layouts.named.scatter[pass_dword].odd; - vkCmdPushConstants( + disp->CmdPushConstants( cb, pl, VK_SHADER_STAGE_COMPUTE_BIT, @@ -1148,7 +1226,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * VkPipeline const p = is_even ? rs->pipelines.named.scatter[pass_dword].even // : rs->pipelines.named.scatter[pass_dword].odd; - vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); + disp->CmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p); } } @@ -1160,10 +1238,7 @@ radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * // End the label // #ifdef RS_VK_ENABLE_DEBUG_UTILS - if (pfn_vkCmdEndDebugUtilsLabelEXT != NULL) - { - pfn_vkCmdEndDebugUtilsLabelEXT(cb); - } + disp->CmdEndDebugUtilsLabelEXT(cb); #endif } @@ -1177,7 +1252,11 @@ radix_sort_vk_fill_buffer(VkCommandBuffer cb, VkDeviceSize size, uint32_t data) { - vkCmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdFillBuffer(cb, buffer_info->buffer, buffer_info->offset + offset, size, data); } // @@ -1221,7 +1300,11 @@ radix_sort_vk_dispatch_indirect(VkCommandBuffer cb, radix_sort_vk_buffer_info_t const * buffer_info, VkDeviceSize offset) { - vkCmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, cb); + const struct vk_device_dispatch_table *disp = + &cmd_buffer->base.device->dispatch_table; + + disp->CmdDispatchIndirect(cb, buffer_info->buffer, buffer_info->offset + offset); } // diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk.h b/src/vulkan/runtime/radix_sort/radix_sort_vk.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_devaddr.h diff --git a/src/amd/vulkan/radix_sort/radix_sort_vk_ext.h b/src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h similarity index 100% rename from src/amd/vulkan/radix_sort/radix_sort_vk_ext.h rename to src/vulkan/runtime/radix_sort/radix_sort_vk_ext.h diff --git a/src/amd/vulkan/radix_sort/shaders/bufref.h b/src/vulkan/runtime/radix_sort/shaders/bufref.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/bufref.h rename to src/vulkan/runtime/radix_sort/shaders/bufref.h diff --git a/src/vulkan/runtime/radix_sort/shaders/config.h b/src/vulkan/runtime/radix_sort/shaders/config.h new file mode 100644 index 00000000000..702f1649605 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/config.h @@ -0,0 +1,33 @@ +// Copyright 2024 Valve Corporation +// SPDX-License-Identifier: MIT + +#ifdef VULKAN +#define CONFIG(_name, _id, default_val) layout (constant_id = _id) const int _name = default_val; +#else +enum rs_config { +#define CONFIG(_name, _id, default_val) _name = _id, +#endif + +#define RS_FILL_WORKGROUP_SIZE_ID 0 +CONFIG(RS_FILL_WORKGROUP_SIZE, 
RS_FILL_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_FILL_BLOCK_ROWS, 1, 8) + +#define RS_HISTOGRAM_WORKGROUP_SIZE_ID 2 +CONFIG(RS_HISTOGRAM_WORKGROUP_SIZE, RS_HISTOGRAM_WORKGROUP_SIZE_ID, 7) +CONFIG(RS_HISTOGRAM_SUBGROUP_SIZE_LOG2, 3, 7) +CONFIG(RS_HISTOGRAM_BLOCK_ROWS, 4, 8) + +#define RS_PREFIX_WORKGROUP_SIZE_ID 5 +CONFIG(RS_PREFIX_WORKGROUP_SIZE, RS_PREFIX_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_PREFIX_SUBGROUP_SIZE_LOG2, 6, 6) + +#define RS_SCATTER_WORKGROUP_SIZE_ID 7 +CONFIG(RS_SCATTER_WORKGROUP_SIZE, RS_SCATTER_WORKGROUP_SIZE_ID, 8) +CONFIG(RS_SCATTER_SUBGROUP_SIZE_LOG2, 8, 6) +CONFIG(RS_SCATTER_BLOCK_ROWS, 9, 14) + +CONFIG(RS_SCATTER_NONSEQUENTIAL_DISPATCH, 10, 0) + +#ifndef VULKAN +}; +#endif diff --git a/src/amd/vulkan/radix_sort/shaders/fill.comp b/src/vulkan/runtime/radix_sort/shaders/fill.comp similarity index 89% rename from src/amd/vulkan/radix_sort/shaders/fill.comp rename to src/vulkan/runtime/radix_sort/shaders/fill.comp index 76b446d8c5d..c85d650d0ff 100644 --- a/src/amd/vulkan/radix_sort/shaders/fill.comp +++ b/src/vulkan/runtime/radix_sort/shaders/fill.comp @@ -49,23 +49,11 @@ layout(push_constant) uniform block_push // RS_STRUCT_INDIRECT_INFO_FILL(); -// -// Check all switches are defined -// -#ifndef RS_FILL_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_FILL_BLOCK_ROWS -#error "Undefined: RS_FILL_BLOCK_ROWS" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) #define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE) #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on @@ -73,7 +61,7 @@ RS_STRUCT_INDIRECT_INFO_FILL(); // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_FILL_WORKGROUP_SIZE_ID) in; // // diff --git a/src/amd/vulkan/radix_sort/shaders/histogram.comp b/src/vulkan/runtime/radix_sort/shaders/histogram.comp similarity index 78% rename from src/amd/vulkan/radix_sort/shaders/histogram.comp rename to src/vulkan/runtime/radix_sort/shaders/histogram.comp index 7d554630fe5..0eb078807b7 100644 --- a/src/amd/vulkan/radix_sort/shaders/histogram.comp +++ b/src/vulkan/runtime/radix_sort/shaders/histogram.comp @@ -61,26 +61,11 @@ layout(push_constant) uniform block_push #error "Undefined: RS_KEYVAL_DWORDS" #endif -// -#ifndef RS_HISTOGRAM_BLOCK_ROWS -#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS" -#endif - -// -#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off -#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE) @@ -104,11 +89,8 @@ layout(push_constant) uniform block_push // #define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_) -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4) -#else -#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4) -#endif +#define RS_HISTOGRAM_OFFSET(pass_) \ + RS_HISTOGRAM_BASE(pass_) + (RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x) * 4 // // Assumes (RS_RADIX_LOG2 == 8) @@ -167,7 +149,7 @@ shared rs_histogram_smem smem; // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_HISTOGRAM_WORKGROUP_SIZE_ID) in; // // @@ -196,41 +178,38 @@ rs_histogram_zero() // // Zero SMEM histogram // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.histogram[smem_offset + ii] = 0; - } + const uint32_t smem_offset = gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.histogram[smem_offset + ii] = 0; - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_idx] = 0; + smem.histogram[smem_offset + ii] = 0; + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + smem.histogram[smem_offset + ii] = 0; } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - smem.histogram[gl_LocalInvocationID.x] = 0; - } - -#endif + if (smem_idx < RS_RADIX_SIZE) + { + smem.histogram[smem_idx] = 0; + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.histogram[gl_LocalInvocationID.x] = 0; + } + } } // @@ -242,50 +221,47 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) // // Store to GMEM // -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = gl_SubgroupInvocationID; - - [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t count = smem.histogram[smem_offset + ii]; + const uint32_t smem_offset = gl_SubgroupInvocationID; - atomicAdd(rs_histograms.extent[ii], count); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t count = smem.histogram[smem_offset + ii]; - - atomicAdd(rs_histograms.extent[ii], count); - } - - const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); - - if (smem_idx < RS_RADIX_SIZE) + [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t count = smem.histogram[smem_idx]; + const uint32_t count = smem.histogram[smem_offset + ii]; - atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], - count); + atomicAdd(rs_histograms.extent[ii], count); + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + const uint32_t count 
= smem.histogram[smem_offset + ii]; + + atomicAdd(rs_histograms.extent[ii], count); } -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE); -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; + if (smem_idx < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[smem_idx]; - atomicAdd(rs_histograms.extent[0], count); - } + atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)], + count); + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t count = smem.histogram[gl_LocalInvocationID.x]; -#endif + atomicAdd(rs_histograms.extent[0], count); + } + } } #endif @@ -298,21 +274,19 @@ rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms) void rs_histogram_atomic_after_write() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } void rs_histogram_read_after_atomic() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - subgroupMemoryBarrierShared(); -#else - barrier(); -#endif + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupMemoryBarrierShared(); + else + barrier(); } #endif diff --git a/src/amd/vulkan/radix_sort/shaders/init.comp b/src/vulkan/runtime/radix_sort/shaders/init.comp similarity index 76% rename from src/amd/vulkan/radix_sort/shaders/init.comp rename to src/vulkan/runtime/radix_sort/shaders/init.comp index 1ffd48d79df..5865be65488 100644 --- a/src/amd/vulkan/radix_sort/shaders/init.comp +++ b/src/vulkan/runtime/radix_sort/shaders/init.comp @@ -53,9 +53,9 @@ RS_STRUCT_INDIRECT_INFO(); // Local macros // // clang-format off -#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2) -#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) -#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2) +#define RS_FILL_WORKGROUP_SIZE (RS_FILL_WORKGROUP_SIZE) +#define RS_SCATTER_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) +#define RS_HISTOGRAM_WORKGROUP_SIZE (RS_HISTOGRAM_WORKGROUP_SIZE) #define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE) #define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE) @@ -150,12 +150,34 @@ main() // 256-dword partitions directly follow the 256-dword histograms, we // can dispatch just one FILL. // + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_dwords + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + keyval_size + // + // NOTE(allanmac): The `.block_offset` and `.dword_offset_min` + // parameters are zeroes because the host can offset the buffer + // device address since the number of passes is known by the host. + // If we ever wanted to supported an indirect number of "key" bits + // in the sort, then this would need to change. 
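Editor's sketch (not part of the patch): the memory map spelled out above sizes the "internal" buffer as one RS_RADIX_SIZE histogram per keyval byte, one partition per scatter block except the last, plus one dword per keyval byte for virtual workgroup-id acquisition. A small C model with example numbers:

#include <stdint.h>
#include <stdio.h>

#define RS_RADIX_SIZE 256u

int main(void)
{
   uint32_t keyval_size    = 8;   /* keyval_dwords == 2, i.e. a 64-bit keyval */
   uint32_t scatter_blocks = 100; /* example scatter block count */
   uint32_t histo_size     = RS_RADIX_SIZE * sizeof(uint32_t);

   /* histograms[keyval_size] + partitions[scatter_blocks - 1] */
   uint32_t internal_size = (keyval_size + scatter_blocks - 1) * histo_size;

   /* workgroup_ids[keyval_size]: one dword per 8-bit pass */
   internal_size += keyval_size * sizeof(uint32_t);

   printf("internal buffer: %u bytes\n", internal_size); /* 109600 */
   return 0;
}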
+ // + // NOTE(allanmac): The `.workgroup_ids[]` are only used if + // nonsequential dispatch isn't supported by the device. + // rs_indirect_info_fill zero; zero.block_offset = 0; zero.dword_offset_min = 0; zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE; + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4); // one pass per byte + const uint32_t zero_ru_blocks = RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS); diff --git a/src/vulkan/runtime/radix_sort/shaders/meson.build b/src/vulkan/runtime/radix_sort/shaders/meson.build new file mode 100644 index 00000000000..4152735b730 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/meson.build @@ -0,0 +1,53 @@ +# Copyright © 2022 Konstantin Seurer + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +radix_sort_shaders = [ + 'init.comp', + 'fill.comp', + 'histogram.comp', + 'prefix.comp', + 'scatter_0_even.comp', + 'scatter_0_odd.comp', + 'scatter_1_even.comp', + 'scatter_1_odd.comp' +] + +shader_include_files = files( + 'bufref.h', + 'prefix_limits.h', + 'prefix.h', + 'push.h', + 'scatter.glsl', + 'config.h', +) + +defines = ['-DRS_KEYVAL_DWORDS=2'] + +radix_sort_spv = [] +foreach s : radix_sort_shaders + radix_sort_spv += custom_target( + s + '.spv.h', + input : s, + output : s + '.spv.h', + command : [ + prog_glslang, '-V', '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@' + ] + defines + glslang_quiet + (with_mesa_debug ? 
['-g'] : []), + depend_files: shader_include_files) +endforeach diff --git a/src/amd/vulkan/radix_sort/shaders/prefix.comp b/src/vulkan/runtime/radix_sort/shaders/prefix.comp similarity index 69% rename from src/amd/vulkan/radix_sort/shaders/prefix.comp rename to src/vulkan/runtime/radix_sort/shaders/prefix.comp index aae88869a6e..650d3305fd6 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix.comp +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.comp @@ -46,41 +46,20 @@ layout(push_constant) uniform block_push #define RS_SUBGROUP_UNIFORM #endif -// -// Check all switches are defined -// -// -#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2" -#endif - // // Local macros // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_PREFIX_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) // clang-format on // -// There is no purpose in having a workgroup size larger than the -// radix size. -// -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) -#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)" -#endif - // // -// -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_PREFIX_WORKGROUP_SIZE_ID) in; // // Histogram buffer reference @@ -95,34 +74,23 @@ layout(buffer_reference, std430) buffer buffer_rs_histograms // #include "prefix_limits.h" -// -// If multi-subgroup then define shared memory -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - //---------------------------------------- shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE]; #define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE]; #define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE]; #define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_] //---------------------------------------- -#endif - -#endif // // Define function arguments @@ -151,37 +119,21 @@ main() // // Define buffer reference to read histograms // -#if (RS_WORKGROUP_SUBGROUPS == 1) - // - // Define histograms bufref for single subgroup - // // NOTE(allanmac): The histogram buffer reference could be adjusted // on the host to save a couple instructions at the cost of added // complexity. // + const uint32_t invocation_id = RS_WORKGROUP_SUBGROUPS == 1 ? 
gl_SubgroupInvocationID : gl_LocalInvocationID.x; + RS_SUBGROUP_UNIFORM const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4; + const uint32_t histograms_offset = (histograms_base + invocation_id) * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, rs_histograms, push.devaddr_histograms, histograms_offset); -#else - // - // Define histograms bufref for workgroup - // - RS_SUBGROUP_UNIFORM - const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE); - const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4; - - RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, - rs_histograms, - push.devaddr_histograms, - histograms_offset); - -#endif // // Compute exclusive prefix of uint32_t[256] diff --git a/src/vulkan/runtime/radix_sort/shaders/prefix.h b/src/vulkan/runtime/radix_sort/shaders/prefix.h new file mode 100644 index 00000000000..f9582da0067 --- /dev/null +++ b/src/vulkan/runtime/radix_sort/shaders/prefix.h @@ -0,0 +1,356 @@ +// Copyright 2021 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ +#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ + +// +// Requires several defines +// +#ifndef RS_PREFIX_LIMITS +#error "Error: \"prefix_limits.h\" not loaded" +#endif + +#ifndef RS_PREFIX_ARGS +#error "Error: RS_PREFIX_ARGS undefined" +#endif + +#ifndef RS_PREFIX_LOAD +#error "Error: RS_PREFIX_LOAD undefined" +#endif + +#ifndef RS_PREFIX_STORE +#error "Error: RS_PREFIX_STORE undefined" +#endif + +// +// Optional switches: +// +// * Disable holding original inclusively scanned histogram values in registers. +// +// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS +// + +// +// Compute exclusive prefix of uint32_t[256] +// +void +rs_prefix(RS_PREFIX_ARGS) +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + { + // + // Workgroup is a single subgroup so no shared memory is required. + // + + // + // Exclusive scan-add the histogram + // + const uint32_t h0 = RS_PREFIX_LOAD(0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0); + RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive + + // + // Each iteration is dependent on the previous so no unrolling. The + // compiler is free to hoist the loads upward though. + // + for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; // + ii < RS_RADIX_SIZE; + ii += RS_SUBGROUP_SIZE) + { + const uint32_t h = RS_PREFIX_LOAD(ii); + const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last; + h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + RS_PREFIX_STORE(ii) = h_inc - h; // exclusive + } + } + else + { + // + // Workgroup is multiple subgroups and uses shared memory to store + // the scan's intermediate results. + // + // Assumes a power-of-two subgroup, workgroup and radix size. + // + // Downsweep: Repeatedly scan reductions until they fit in a single + // subgroup. + // + // Upsweep: Then uniformly apply reductions to each subgroup. 
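To make the single-subgroup branch above easier to follow, here is a scalar C model of the same computation: the 256 histogram bins are scanned one subgroup-sized chunk at a time, with the previous chunk's running total (h_last) carried into the next chunk. The function and names are illustrative only, not part of the patch.

#include <stdint.h>

#define RS_RADIX_SIZE 256

/* Scalar model of the single-subgroup exclusive scan above. */
static void
rs_prefix_model(uint32_t h[RS_RADIX_SIZE], uint32_t subgroup_size)
{
   uint32_t h_last = 0; /* carry from the previous chunk */

   for (uint32_t base = 0; base < RS_RADIX_SIZE; base += subgroup_size) {
      uint32_t inc = h_last;

      for (uint32_t i = 0; i < subgroup_size; i++) {
         const uint32_t v = h[base + i];

         inc += v;              /* inclusive scan within the chunk          */
         h[base + i] = inc - v; /* keep the exclusive prefix, like h_inc - h */
      }

      h_last = inc; /* plays the role of subgroupBroadcast(h_inc, last) */
   }
}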
+ // + // + // Subgroup Size | 4 | 8 | 16 | 32 | 64 | 128 | + // --------------+----+----+----+----+----+-----+ + // Sweep 0 | 64 | 32 | 16 | 8 | 4 | 2 | sweep_0[] + // Sweep 1 | 16 | 4 | - | - | - | - | sweep_1[] + // Sweep 2 | 4 | - | - | - | - | - | sweep_2[] + // --------------+----+----+----+----+----+-----+ + // Total dwords | 84 | 36 | 16 | 8 | 4 | 2 | + // --------------+----+----+----+----+----+-----+ + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + uint32_t h_exc[RS_H_COMPONENTS]; +#endif + + // + // Downsweep 0 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + const uint32_t h_inc = subgroupInclusiveAdd(h); + + const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1); + + // +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + h_exc[ii] = h_inc - h; +#else + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h; +#endif + } + + barrier(); + + // + // Skip generalizing these sweeps for all possible subgroups -- just + // write them directly. + // + if (RS_SUBGROUP_SIZE == 128) + { + // There are only two elements in SWEEP0 per subgroup. The scan is + // trivial so we fold it into the upsweep. + } + else if (RS_SUBGROUP_SIZE >= 16) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 + // + if (RS_SWEEP_0_SIZE != RS_WORKGROUP_SIZE && // workgroup has inactive components + gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 8) + { + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations + { + const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 + // + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + } + + barrier(); + } + else if (RS_SUBGROUP_SIZE == 4) + { + ////////////////////////////////////////////////////////////////////// + // + // Scan 0 and Downsweep 1 + // + if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations + { + const uint32_t h0_red = 
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0); + const uint32_t h0_inc = subgroupInclusiveAdd(h0_red); + + RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red; + RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 1 and Downsweep 2 + // + if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE) + { + if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations + { + const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + else + { + [[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations + { + const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x; + const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1); + const uint32_t h1_inc = subgroupInclusiveAdd(h1_red); + + RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red; + RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1); + } + } + + barrier(); + + // + // Scan 2 + // + // 4 invocations + // + if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE) + { + const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x); + const uint32_t h2_inc = subgroupInclusiveAdd(h2_red); + + RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red; + } + + barrier(); + } + + ////////////////////////////////////////////////////////////////////// + // + // Final upsweep 0 + // + if (RS_SUBGROUP_SIZE == 128) + { + // There must be more than one subgroup per workgroup, but the maximum + // workgroup size is 256 so there must be exactly two subgroups per + // workgroup and RS_H_COMPONENTS must be 1. +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(0) = h_exc[0] + (gl_SubgroupID > 0 ? RS_PREFIX_SWEEP0(0) : 0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(0); + + RS_PREFIX_STORE(0) = h_exc + (gl_SubgroupID > 0 ? 
RS_PREFIX_SWEEP0(0) : 0); +#endif + } + else if (RS_SUBGROUP_SIZE >= 16) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + + // clang format issue +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 8) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1); +#endif + } + } + else if (RS_SUBGROUP_SIZE == 4) + { + [[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++) + { + const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID; + const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE; + const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE; + +#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#else + const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE); + + RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = + h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2)); +#endif + } + } + } +} + +// +// +// + +#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_ diff --git a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h similarity index 71% rename from src/amd/vulkan/radix_sort/shaders/prefix_limits.h rename to src/vulkan/runtime/radix_sort/shaders/prefix_limits.h index a98e554ad4a..4d0e89fb9c2 100644 --- a/src/amd/vulkan/radix_sort/shaders/prefix_limits.h +++ b/src/vulkan/runtime/radix_sort/shaders/prefix_limits.h @@ -10,17 +10,12 @@ // #define RS_PREFIX_LIMITS -// -// Multi-subgroup prefix requires shared memory. -// -#if (RS_WORKGROUP_SUBGROUPS > 1) - // clang-format off #define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE) -#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE) -#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE) +#define RS_SWEEP_0_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_1_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)) +#define RS_SWEEP_2_SIZE (RS_WORKGROUP_SUBGROUPS == 1 ? 0 : (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)) #define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE) @@ -32,15 +27,6 @@ #define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE) // clang-format on -// -// Single subgroup prefix doesn't use shared memory. 
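For intuition, the multi-subgroup path above is a classic multi-level scan: exclusive scans within each chunk, an exclusive scan over the chunk totals, then a uniform add back. A minimal two-level scalar sketch follows (one sweep level only; the shader adds further levels for very small subgroups, and all names here are illustrative):

#include <stdint.h>

/* Two-level exclusive scan; "chunk" plays the role of the subgroup size. */
static void
two_level_exclusive_scan(uint32_t *h, uint32_t n, uint32_t chunk)
{
   uint32_t sweep0[256]; /* chunk totals; n / chunk entries are used */
   const uint32_t chunks = n / chunk;

   /* Downsweep: exclusive scan within each chunk, record the chunk total. */
   for (uint32_t c = 0; c < chunks; c++) {
      uint32_t sum = 0;

      for (uint32_t i = 0; i < chunk; i++) {
         const uint32_t v = h[c * chunk + i];

         h[c * chunk + i] = sum;
         sum += v;
      }

      sweep0[c] = sum;
   }

   /* Exclusive scan of the chunk totals themselves. */
   uint32_t carry = 0;

   for (uint32_t c = 0; c < chunks; c++) {
      const uint32_t v = sweep0[c];

      sweep0[c] = carry;
      carry += v;
   }

   /* Upsweep: uniformly add each chunk's scanned total back in. */
   for (uint32_t c = 0; c < chunks; c++)
      for (uint32_t i = 0; i < chunk; i++)
         h[c * chunk + i] += sweep0[c];
}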
-// -#else - -#define RS_SWEEP_SIZE 0 - -#endif - // // // diff --git a/src/amd/vulkan/radix_sort/shaders/push.h b/src/vulkan/runtime/radix_sort/shaders/push.h similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/push.h rename to src/vulkan/runtime/radix_sort/shaders/push.h diff --git a/src/amd/vulkan/radix_sort/shaders/scatter.glsl b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl similarity index 58% rename from src/amd/vulkan/radix_sort/shaders/scatter.glsl rename to src/vulkan/runtime/radix_sort/shaders/scatter.glsl index b57d9e80850..bacd44682f5 100644 --- a/src/amd/vulkan/radix_sort/shaders/scatter.glsl +++ b/src/vulkan/runtime/radix_sort/shaders/scatter.glsl @@ -84,21 +84,6 @@ layout(push_constant) uniform block_push #error "Undefined: RS_SCATTER_KEYVAL_DWORD_BASE" #endif -// -#ifndef RS_SCATTER_BLOCK_ROWS -#error "Undefined: RS_SCATTER_BLOCK_ROWS" -#endif - -// -#ifndef RS_SCATTER_SUBGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_SUBGROUP_SIZE_LOG2" -#endif - -// -#ifndef RS_SCATTER_WORKGROUP_SIZE_LOG2 -#error "Undefined: RS_SCATTER_WORKGROUP_SIZE_LOG2" -#endif - // // Status masks are defined differently for the scatter_even and // scatter_odd shaders. @@ -140,7 +125,7 @@ layout(push_constant) uniform block_push // // clang-format off #define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4) -#define RS_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2) +#define RS_WORKGROUP_SIZE (RS_SCATTER_WORKGROUP_SIZE) #define RS_SUBGROUP_SIZE (1 << RS_SCATTER_SUBGROUP_SIZE_LOG2) #define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE) #define RS_SUBGROUP_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SUBGROUP_SIZE) @@ -148,13 +133,6 @@ layout(push_constant) uniform block_push #define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1) // clang-format on -// -// Validate number of keyvals fit in a uint16_t. -// -#if (RS_BLOCK_KEYVALS >= 65536) -#error "Error: (RS_BLOCK_KEYVALS >= 65536)" -#endif - // // Keyval type // @@ -181,9 +159,7 @@ layout(push_constant) uniform block_push // Determine at compile time the base of the final iteration for // workgroups smaller than RS_RADIX_SIZE. // -#if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) #define RS_WORKGROUP_BASE_FINAL ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE) -#endif // // Max macro @@ -291,7 +267,7 @@ layout(push_constant) uniform block_push // // // -layout(local_size_x = RS_WORKGROUP_SIZE) in; +layout(local_size_x_id = RS_SCATTER_WORKGROUP_SIZE_ID) in; // // @@ -325,48 +301,55 @@ shared rs_scatter_smem smem; // The shared memory barrier is either subgroup-wide or // workgroup-wide. 
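Because the workgroup size is now a specialization constant (local_size_x_id) rather than a compile-time define, the host must supply it at pipeline creation. A minimal sketch of what that could look like; the constant ID value and names are assumptions for illustration, not the runtime's actual code:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Feed the scatter shader's workgroup size through a specialization
 * constant.  "workgroup_size" and "entry" must outlive pipeline creation. */
static VkSpecializationInfo
rs_scatter_spec_info(const uint32_t *workgroup_size,
                     VkSpecializationMapEntry *entry)
{
   entry->constantID = 1; /* assumed value of RS_SCATTER_WORKGROUP_SIZE_ID */
   entry->offset = 0;
   entry->size = sizeof(uint32_t);

   const VkSpecializationInfo info = {
      .mapEntryCount = 1,
      .pMapEntries = entry,
      .dataSize = sizeof(uint32_t),
      .pData = workgroup_size,
   };

   return info;
}

The returned struct would be passed through VkPipelineShaderStageCreateInfo::pSpecializationInfo when the scatter pipelines are created, which is what lets the former preprocessor branches become uniform runtime branches below.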
// -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_BARRIER() subgroupBarrier() -#else -#define RS_BARRIER() barrier() -#endif +void rsBarrier() +{ + if (RS_WORKGROUP_SUBGROUPS == 1) + subgroupBarrier(); + else + barrier(); +} // // If multi-subgroup then define shared memory // -#if (RS_WORKGROUP_SUBGROUPS > 1) //---------------------------------------- #define RS_PREFIX_SWEEP0(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_0_OFFSET + (idx_)] //---------------------------------------- -#if (RS_SWEEP_1_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP1(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_1_OFFSET + (idx_)] //---------------------------------------- -#endif -#if (RS_SWEEP_2_SIZE > 0) //---------------------------------------- #define RS_PREFIX_SWEEP2(idx_) smem.extent[RS_SMEM_PREFIX_OFFSET + RS_SWEEP_2_OFFSET + (idx_)] //---------------------------------------- -#endif -#endif +uint32_t +invocation_id() +{ + return RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupID : gl_LocalInvocationID.x; +} // // Define prefix load/store functions // // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID + (idx_)] -#else -#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x + (idx_)] -#endif +#define RS_PREFIX_LOAD(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] +#define RS_PREFIX_STORE(idx_) smem.extent[RS_SMEM_HISTOGRAM_OFFSET + invocation_id() + (idx_)] // clang-format on +layout(buffer_reference, std430) buffer buffer_rs_workgroup_id +{ + uint32_t x[RS_KEYVAL_DWORDS * 4]; +}; + +#define RS_IS_FIRST_LOCAL_INVOCATION() (RS_WORKGROUP_SUBGROUPS == 1 ? gl_SubgroupInvocationID == 0 : gl_LocalInvocationID.x == 0) + +RS_SUBGROUP_UNIFORM uint32_t rs_gl_workgroup_id_x; + +#define RS_GL_WORKGROUP_ID_X (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0 ? 
rs_gl_workgroup_id_x : gl_WorkGroupID.x) + // // Load the prefix function // @@ -383,45 +366,43 @@ shared rs_scatter_smem smem; void rs_histogram_zero() { -#if (RS_WORKGROUP_SUBGROUPS == 1) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - smem.extent[smem_offset + ii] = 0; - } + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - smem.extent[smem_offset + ii] = 0; - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - smem.histogram[smem_offset_final] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + smem.extent[smem_offset + ii] = 0; } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - RS_BARRIER(); + if (smem_offset_final < RS_RADIX_SIZE) + { + smem.extent[smem_offset_final] = 0; + } + } + } + else + { + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x] = 0; + } + } + + rsBarrier(); } // @@ -450,11 +431,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #ifdef RS_SCATTER_ENABLE_NV_MATCH - // - // 32 - // -#if (RS_SUBGROUP_SIZE == 32) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { // @@ -470,13 +446,6 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); } - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Default is to emulate a `match` operation with ballots. @@ -484,79 +453,32 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], //---------------------------------------------------------------------- #elif !defined(RS_SCATTER_ENABLE_BROADCAST_MATCH) - // - // 64 - // -#if (RS_SUBGROUP_SIZE == 64) - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - u32vec2 match; + u32vec4 match; { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, 0); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 
0 : 0xFFFFFFFF); - match.x = (ballot.x ^ mask); - match.y = (ballot.y ^ mask); + match = ballot ^ mask; } [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const u32vec2 ballot = subgroupBallot(is_one).xy; - const uint32_t mask = is_one ? 0 : 0xFFFFFFFF; + const bool is_one = RS_BIT_IS_ONE(digit, bit); + const u32vec4 ballot = subgroupBallot(is_one); + const u32vec4 mask = u32vec4(is_one ? 0 : 0xFFFFFFFF); - match.x &= (ballot.x ^ mask); - match.y &= (ballot.y ^ mask); + match &= ballot ^ mask; } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (subgroupBallotBitCount(match) << 16) | subgroupBallotInclusiveBitCount(match); } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - uint32_t match; - - { - const bool is_one = RS_BIT_IS_ONE(digit, 0); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match = (ballot ^ mask); - } - - [[unroll]] for (int32_t bit = 1; bit < RS_RADIX_LOG2; bit++) - { - const bool is_one = RS_BIT_IS_ONE(digit, bit); - const uint32_t ballot = subgroupBallot(is_one).x; - const uint32_t mask = is_one ? 0 : RS_SUBGROUP_MASK; - - match &= (ballot ^ mask); - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - //---------------------------------------------------------------------- // // Emulate a `match` operation with broadcasts. @@ -569,69 +491,58 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // 64 // -#if (RS_SUBGROUP_SIZE == 64) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) + if (RS_SUBGROUP_SIZE == 64) { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - u32vec2 match; - - // subgroup invocation 0 + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocations 1-31 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + u32vec2 match; + + // subgroup invocation 0 + { + match[0] = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 1-31 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + // subgroup invocation 32 + { + match[1] = (subgroupBroadcast(digit, 32) == digit) ? (1u << 0) : 0; + } + + // subgroup invocations 33-63 + [[unroll]] for (int32_t jj = 1; jj < 32; jj++) + { + match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } + + kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | + (bitCount(match.x & gl_SubgroupLeMask.x) + // + bitCount(match.y & gl_SubgroupLeMask.y)); + } + } else if (RS_SUBGROUP_SIZE <= 32) { + [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - match[0] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - // subgroup invocation 32 - { - match[1] = (subgroupBroadcast(digit, 32) == digit) ? 
(1u << 0) : 0; - } + // subgroup invocation 0 + uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - // subgroup invocations 33-63 - [[unroll]] for (int32_t jj = 1; jj < 32; jj++) - { - match[1] |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } + // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) + [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) + { + match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; + } - kr[ii] = ((bitCount(match.x) + bitCount(match.y)) << 16) | - (bitCount(match.x & gl_SubgroupLeMask.x) + // - bitCount(match.y & gl_SubgroupLeMask.y)); + kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); + } } - // - // <= 32 - // -#elif ((RS_SUBGROUP_SIZE <= 32) && !defined(RS_SCATTER_ENABLE_NV_MATCH)) - - [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) - { - const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[ii]); - - // subgroup invocation 0 - uint32_t match = (subgroupBroadcast(digit, 0) == digit) ? (1u << 0) : 0; - - // subgroup invocations 1-(RS_SUBGROUP_SIZE-1) - [[unroll]] for (int32_t jj = 1; jj < RS_SUBGROUP_SIZE; jj++) - { - match |= (subgroupBroadcast(digit, jj) == digit) ? (1u << jj) : 0; - } - - kr[ii] = (bitCount(match) << 16) | bitCount(match & gl_SubgroupLeMask.x); - } - - // - // Undefined! - // -#else -#error "Error: rs_histogram_rank() undefined for subgroup size" -#endif - #endif // @@ -660,7 +571,7 @@ rs_histogram_rank(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], } } - RS_BARRIER(); + rsBarrier(); } } @@ -677,110 +588,103 @@ rs_first_prefix_store(restrict buffer_rs_partitions rs_partitions) // // Define the histogram reference // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t hist_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t hist_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t hist_offset = invocation_id() * 4; readonly RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histogram, rs_histogram, push.devaddr_histograms, hist_offset); -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - smem.extent[smem_offset_l + ii] = exc; - - const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t exc = rs_histogram.extent[ii]; - const uint32_t red = smem.extent[smem_offset_h + ii]; - - smem.extent[smem_offset_l + ii] = exc; - - 
const uint32_t inc = exc + red; - - atomicStore(rs_partitions.extent[ii], - inc | RS_PARTITION_MASK_PREFIX, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; - const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; - const uint32_t red = smem.extent[smem_offset_final_h]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[smem_offset_final_l] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset_h = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; + const uint32_t smem_offset_l = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t exc = rs_histogram.extent[0]; - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t exc = rs_histogram.extent[ii]; + const uint32_t red = smem.extent[smem_offset_h + ii]; - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset_l + ii] = exc; const uint32_t inc = exc + red; - atomicStore(rs_partitions.extent[0], + atomicStore(rs_partitions.extent[ii], inc | RS_PARTITION_MASK_PREFIX, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final_h = smem_offset_h + RS_WORKGROUP_BASE_FINAL; + const uint32_t smem_offset_final_l = smem_offset_l + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final_h < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[RS_WORKGROUP_BASE_FINAL]; + const uint32_t red = smem.extent[smem_offset_final_h]; + + smem.extent[smem_offset_final_l] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[RS_WORKGROUP_BASE_FINAL], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t exc = rs_histogram.extent[0]; + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + smem.extent[RS_SMEM_LOOKBACK_OFFSET + 
gl_LocalInvocationID.x] = exc; + + const uint32_t inc = exc + red; + + atomicStore(rs_partitions.extent[0], + inc | RS_PARTITION_MASK_PREFIX, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -790,76 +694,77 @@ void rs_reduction_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - const uint32_t red = smem.extent[smem_offset + ii]; + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SUBGROUPS == 1) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_SubgroupInvocationID; - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - const uint32_t red = smem.extent[smem_offset + ii]; - - atomicStore(rs_partitions.extent[partition_base + ii], - red | RS_PARTITION_MASK_REDUCTION, - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsRelease); - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { - const uint32_t red = smem.extent[smem_offset_final]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { - const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + const uint32_t red = smem.extent[smem_offset + ii]; - atomicStore(rs_partitions.extent[partition_base], + atomicStore(rs_partitions.extent[partition_base + ii], red | RS_PARTITION_MASK_REDUCTION, gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsRelease); + gl_SemanticsRelease | gl_SemanticsMakeAvailable); } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t 
smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[smem_offset_final]; + + atomicStore(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } + } + else if (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + const uint32_t red = smem.extent[RS_SMEM_HISTOGRAM_OFFSET + gl_LocalInvocationID.x]; + + atomicStore(rs_partitions.extent[partition_base], + red | RS_PARTITION_MASK_REDUCTION, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsRelease | gl_SemanticsMakeAvailable); + } + } } // @@ -875,120 +780,15 @@ void rs_lookback_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
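The partition stores above are the publishing half of a decoupled-lookback scheme: each 32-bit slot packs a count plus a status, and the slot only becomes visible to other workgroups through a release (make-available) store, matched by acquire/make-visible loads on the consumer side. A CPU analogue using C11 atomics, with stand-in mask values since the real RS_PARTITION_MASK_* constants are defined elsewhere:

#include <stdatomic.h>
#include <stdint.h>

/* Stand-ins for RS_PARTITION_MASK_*: count in the low bits, status in the
 * top two bits (0 = invalid, 1 = reduction, 2 = prefix). */
#define MASK_COUNT     0x3fffffffu
#define MASK_REDUCTION (1u << 30)
#define MASK_PREFIX    (2u << 30)
#define MASK_STATUS    (MASK_REDUCTION | MASK_PREFIX)

/* Publish a partition's local reduction (or, for the first partition, its
 * inclusive prefix) so that later partitions can consume it. */
static void
publish(_Atomic uint32_t *slot, uint32_t count, uint32_t status)
{
   atomic_store_explicit(slot, count | status, memory_order_release);
}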
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // - // Otherwise, save the exclusive scan and atomically transform - // the reduction into an inclusive prefix status math: - // - // reduction + 1 = prefix - // - smem.extent[smem_offset + ii] = exc; - - atomicAdd(rs_partitions.extent[partition_base + ii], - exc | (1 << 30), - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquireRelease); - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1002,7 +802,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1027,7 +827,7 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + atomicAdd(rs_partitions.extent[partition_base + ii], exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1035,16 +835,16 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, break; } } -#endif + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1055,10 +855,10 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1081,9 +881,9 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, // // reduction + 1 = prefix // - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; - atomicAdd(rs_partitions.extent[partition_base], + atomicAdd(rs_partitions.extent[partition_base + ii], 
exc | (1 << 30), gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, @@ -1092,7 +892,113 @@ rs_lookback_store(restrict buffer_rs_partitions rs_partitions, } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_SMEM_LOOKBACK_OFFSET + RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[smem_offset + RS_WORKGROUP_BASE_FINAL] = exc; + + atomicAdd(rs_partitions.extent[partition_base + RS_WORKGROUP_BASE_FINAL], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
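The `exc | (1 << 30)` atomicAdd above is the "reduction + 1 = prefix" trick: a single atomic both folds the exclusive prefix into the count and bumps the status from REDUCTION to PREFIX. A tiny self-contained check, again with illustrative bit assignments (count in bits 0..29, status in bits 30..31):

#include <assert.h>
#include <stdint.h>

#define COUNT(x)  ((x) & 0x3fffffffu)
#define STATUS(x) ((x) >> 30) /* 0 invalid, 1 reduction, 2 prefix */

int
main(void)
{
   uint32_t slot = 17u | (1u << 30); /* local reduction 17, status REDUCTION */
   const uint32_t exc = 5;           /* exclusive prefix from the lookback   */

   slot += exc | (1u << 30);         /* what the atomicAdd performs          */

   assert(COUNT(slot) == 22);        /* count is now the inclusive prefix */
   assert(STATUS(slot) == 2);        /* status upgraded to PREFIX         */

   return 0;
}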
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // + // Otherwise, save the exclusive scan and atomically transform + // the reduction into an inclusive prefix status math: + // + // reduction + 1 = prefix + // + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + + atomicAdd(rs_partitions.extent[partition_base], + exc | (1 << 30), + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + break; + } + } + } } // @@ -1105,98 +1011,15 @@ void rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, RS_SUBGROUP_UNIFORM const uint32_t partition_base) { -#if (RS_WORKGROUP_SUBGROUPS == 1) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SUBGROUPS == 1) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) + if (RS_WORKGROUP_SUBGROUPS == 1) { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - + //////////////////////////////////////////////////////////////////////////// // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. + // (RS_WORKGROUP_SUBGROUPS == 1) // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_SubgroupInvocationID; - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) - // - const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; - - [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. 
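Putting the consuming side together: each invocation walks backwards through earlier partitions for its digit, spins until a slot is valid, sums reductions, and stops at the first inclusive prefix. A scalar sketch of that walk (a model of the control flow, not the shader itself; mask values are the same illustrative stand-ins as before):

#include <stdatomic.h>
#include <stdint.h>

#define MASK_COUNT     0x3fffffffu /* stand-ins for RS_PARTITION_MASK_* */
#define MASK_REDUCTION (1u << 30)
#define MASK_PREFIX    (2u << 30)
#define MASK_STATUS    (MASK_REDUCTION | MASK_PREFIX)
#define RADIX          256u

/* Accumulate the exclusive prefix for one digit by walking earlier
 * partitions until an inclusive prefix is found. */
static uint32_t
lookback(const _Atomic uint32_t *partitions, uint32_t partition, uint32_t digit)
{
   uint32_t exc = 0;

   while (partition-- > 0) {
      uint32_t prev;

      /* Spin until the producer has published something valid. */
      do {
         prev = atomic_load_explicit(&partitions[partition * RADIX + digit],
                                     memory_order_acquire);
      } while ((prev & MASK_STATUS) == 0);

      exc += prev & MASK_COUNT;

      if ((prev & MASK_STATUS) == MASK_PREFIX)
         break; /* inclusive prefix reached: nothing further to add */

      /* Otherwise it was only a reduction: keep walking backwards. */
   }

   return exc;
}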
- // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset + ii] = exc; - break; - } - } - -#if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) - const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; - - if (smem_offset_final < RS_RADIX_SIZE) + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE) { uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; uint32_t exc = 0; @@ -1207,56 +1030,10 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, // while (true) { - const uint32_t prev = - atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], - gl_ScopeQueueFamily, - gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); - - // spin until valid - if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) - { - continue; - } - - exc += (prev & RS_PARTITION_MASK_COUNT); - - if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) - { - // continue accumulating reductions - partition_base_prev -= RS_RADIX_SIZE; - continue; - } - - // Otherwise, save the exclusive scan. - smem.extent[smem_offset_final] = exc; - break; - } - } -#endif - -#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - //////////////////////////////////////////////////////////////////////////// - // - // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) - // -#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE) - if (gl_LocalInvocationID.x < RS_RADIX_SIZE) -#endif - { - uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; - uint32_t exc = 0; - - // - // NOTE: Each workgroup invocation can proceed independently. - // Subgroups and workgroups do NOT have to coordinate. - // - while (true) - { - const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], gl_ScopeQueueFamily, gl_StorageSemanticsBuffer, - gl_SemanticsAcquire); + gl_SemanticsAcquire | gl_SemanticsMakeVisible); // spin until valid if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) @@ -1274,12 +1051,142 @@ rs_lookback_skip_store(restrict buffer_rs_partitions rs_partitions, } // Otherwise, save the exclusive scan. - smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + smem.extent[smem_offset + ii] = exc; + break; + } + } + } + else if (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE < RS_RADIX_SIZE) + // + const uint32_t smem_offset = RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x; + + [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. 
+ // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev + ii], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset + ii] = exc; break; } } -#endif + if (RS_WORKGROUP_BASE_FINAL < RS_RADIX_SIZE) + { + const uint32_t smem_offset_final = smem_offset + RS_WORKGROUP_BASE_FINAL; + + if (smem_offset_final < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = + atomicLoad(rs_partitions.extent[partition_base_prev + RS_WORKGROUP_BASE_FINAL], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[smem_offset_final] = exc; + break; + } + } + } + } + else + { + //////////////////////////////////////////////////////////////////////////// + // + // (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE) + // + if (RS_WORKGROUP_SIZE == RS_RADIX_SIZE || gl_LocalInvocationID.x < RS_RADIX_SIZE) + { + uint32_t partition_base_prev = partition_base - RS_RADIX_SIZE; + uint32_t exc = 0; + + // + // NOTE: Each workgroup invocation can proceed independently. + // Subgroups and workgroups do NOT have to coordinate. + // + while (true) + { + const uint32_t prev = atomicLoad(rs_partitions.extent[partition_base_prev], + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquire | gl_SemanticsMakeVisible); + + // spin until valid + if ((prev & RS_PARTITION_MASK_STATUS) == RS_PARTITION_MASK_INVALID) + { + continue; + } + + exc += (prev & RS_PARTITION_MASK_COUNT); + + if ((prev & RS_PARTITION_MASK_STATUS) != RS_PARTITION_MASK_PREFIX) + { + // continue accumulating reductions + partition_base_prev -= RS_RADIX_SIZE; + continue; + } + + // Otherwise, save the exclusive scan. + smem.extent[RS_SMEM_LOOKBACK_OFFSET + gl_LocalInvocationID.x] = exc; + break; + } + } + } } // @@ -1302,7 +1209,7 @@ rs_rank_to_local(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], // // Reordering phase will overwrite histogram span. 
// - RS_BARRIER(); + rsBarrier(); } // @@ -1333,13 +1240,7 @@ rs_rank_to_global(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], void rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1353,7 +1254,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1363,7 +1264,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1376,7 +1277,7 @@ rs_reorder(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_ smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1395,13 +1296,7 @@ void rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], inout uint32_t kr[RS_SCATTER_BLOCK_ROWS]) { - // clang-format off -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_SubgroupInvocationID; -#else - const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + gl_LocalInvocationID.x; -#endif - // clang-format on + const uint32_t smem_base = RS_SMEM_REORDER_OFFSET + invocation_id(); [[unroll]] for (uint32_t ii = 0; ii < RS_KEYVAL_DWORDS; ii++) { @@ -1415,7 +1310,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = RS_KV_DWORD(kv[jj], ii); } - RS_BARRIER(); + rsBarrier(); // // Load keyval dword from sorted location @@ -1425,7 +1320,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], RS_KV_DWORD(kv[jj], ii) = smem.extent[smem_base + jj * RS_WORKGROUP_SIZE]; } - RS_BARRIER(); + rsBarrier(); } // @@ -1438,7 +1333,7 @@ rs_reorder_1(inout RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], smem.extent[smem_idx] = uint32_t(kr[ii]); } - RS_BARRIER(); + rsBarrier(); // // Load kr[] from sorted location -- we only need the rank. @@ -1459,7 +1354,7 @@ rs_load(out RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS]) // // Set up buffer reference // - const uint32_t kv_in_offset_keys = gl_WorkGroupID.x * RS_BLOCK_KEYVALS + + const uint32_t kv_in_offset_keys = RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + gl_SubgroupID * RS_SUBGROUP_KEYVALS + gl_SubgroupInvocationID; u32vec2 kv_in_offset; @@ -1530,6 +1425,58 @@ rs_store(const RS_KEYVAL_TYPE kv[RS_SCATTER_BLOCK_ROWS], const uint32_t kr[RS_SC void main() { + // + // If this is a nonsequential dispatch device then acquire a virtual + // workgroup id. + // + // This is only run once and is a special compile-time-enabled case + // so we leverage the existing `push.devaddr_partitions` address + // instead of altering the push constant structure definition. 
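The rs_reorder()/rs_reorder_1() helpers above amount to a scatter-by-rank pass through shared memory so the block is locally sorted before the global scatter. Modelled on the CPU it is just this (illustrative only; rank is assumed to be a permutation of 0..n-1):

#include <stdint.h>
#include <string.h>

/* Scatter every keyval to the slot named by its local rank, then read the
 * block back contiguously in sorted order. */
static void
reorder_by_rank(uint32_t *kv, const uint32_t *rank, uint32_t n, uint32_t *tmp)
{
   for (uint32_t i = 0; i < n; i++)
      tmp[rank[i]] = kv[i];               /* scatter to the sorted position */

   memcpy(kv, tmp, n * sizeof(uint32_t)); /* gather back in order */
}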
+ // + if (RS_SCATTER_NONSEQUENTIAL_DISPATCH != 0) + { + if (RS_IS_FIRST_LOCAL_INVOCATION()) + { + // The "internal" memory map looks like this: + // + // +---------------------------------+ <-- 0 + // | histograms[keyval_size] | + // +---------------------------------+ <-- keyval_size * histo_size + // | partitions[scatter_blocks_ru-1] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + // | workgroup_ids[keyval_size] | + // +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_size + workgroup_ids_size + // + // Extended multiply to avoid 4GB overflow + // + u32vec2 workgroup_id_offset; + + umulExtended((gl_NumWorkGroups.x - 1), // virtual workgroup ids follow partitions[] + 4 * RS_RADIX_SIZE, // sizeof(uint32_t) * 256 + workgroup_id_offset.y, // msb + workgroup_id_offset.x); // lsb + + RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_workgroup_id, + rs_workgroup_id, + push.devaddr_partitions, + workgroup_id_offset); + + const uint32_t x_idx = RS_SCATTER_KEYVAL_DWORD_BASE * 4 + (push.pass_offset / RS_RADIX_LOG2); + + smem.extent[0] = atomicAdd(rs_workgroup_id.x[x_idx], + 1, + gl_ScopeQueueFamily, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease); + } + + rsBarrier(); + + rs_gl_workgroup_id_x = smem.extent[0]; + + rsBarrier(); + } + // // Load keyvals // @@ -1568,7 +1515,7 @@ main() [[unroll]] for (uint32_t ii = 0; ii < RS_SCATTER_BLOCK_ROWS; ii++) { - rs_kv_out.extent[gl_WorkGroupID.x * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; + rs_kv_out.extent[RS_GL_WORKGROUP_ID_X * RS_BLOCK_KEYVALS + ii * RS_WORKGROUP_SIZE] = kr[ii]; } return; @@ -1594,11 +1541,7 @@ main() // // Define partitions bufref // -#if (RS_WORKGROUP_SUBGROUPS == 1) - const uint32_t partition_offset = gl_SubgroupInvocationID * 4; -#else - const uint32_t partition_offset = gl_LocalInvocationID.x * 4; -#endif + const uint32_t partition_offset = invocation_id() * 4; RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_partitions, rs_partitions, @@ -1608,7 +1551,7 @@ main() // // The first partition is a special case. // - if (gl_WorkGroupID.x == 0) + if (RS_GL_WORKGROUP_ID_X == 0) { // // Other workgroups may lookback on this partition. @@ -1623,12 +1566,12 @@ main() // // Otherwise, this is not the first workgroup. // - RS_SUBGROUP_UNIFORM const uint32_t partition_base = gl_WorkGroupID.x * RS_RADIX_SIZE; + RS_SUBGROUP_UNIFORM const uint32_t partition_base = RS_GL_WORKGROUP_ID_X * RS_RADIX_SIZE; // // The last partition is a special case. // - if (gl_WorkGroupID.x + 1 < gl_NumWorkGroups.x) + if (RS_GL_WORKGROUP_ID_X + 1 < gl_NumWorkGroups.x) { // // Atomically store the reduction to the global partition. @@ -1667,7 +1610,7 @@ main() // // Barrier before reading prefix scanned histogram. // - RS_BARRIER(); + rsBarrier(); // // Convert keyval's rank to a local index @@ -1686,7 +1629,7 @@ main() // // Wait for lookback to complete. 
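On the host side the same "internal" layout has to be sized and zeroed, which is what the enlarged fill region at the top of this patch accounts for. A sketch of the offsets with illustrative names (histo_size is 256 dwords, matching the memory-map comment above; the real driver code may differ):

#include <stdint.h>

/* Byte offsets into the internal buffer: histograms, then lookback
 * partitions, then one virtual-workgroup-id counter per scatter pass. */
struct rs_internal_layout {
   uint64_t histograms;
   uint64_t partitions;
   uint64_t workgroup_ids;
   uint64_t total;
};

static struct rs_internal_layout
rs_internal_layout_init(uint32_t keyval_dwords, uint32_t scatter_blocks_ru)
{
   const uint64_t histo_size = 256u * sizeof(uint32_t); /* RS_RADIX_SIZE dwords */
   const uint32_t keyval_size = keyval_dwords * 4;      /* one pass per byte    */

   struct rs_internal_layout l;

   l.histograms = 0;
   l.partitions = l.histograms + keyval_size * histo_size;
   l.workgroup_ids = l.partitions + (uint64_t)(scatter_blocks_ru - 1) * histo_size;
   l.total = l.workgroup_ids + keyval_size * sizeof(uint32_t);

   return l;
}

Placing the per-pass counters directly after partitions[] is what allows the shader above to reach them from push.devaddr_partitions with a single extended multiply instead of a new push-constant field.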
// - RS_BARRIER(); + rsBarrier(); #endif // diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_0_odd.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_even.comp diff --git a/src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp b/src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp similarity index 100% rename from src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp rename to src/vulkan/runtime/radix_sort/shaders/scatter_1_odd.comp diff --git a/src/amd/vulkan/radix_sort/target.h b/src/vulkan/runtime/radix_sort/target.h similarity index 94% rename from src/amd/vulkan/radix_sort/target.h rename to src/vulkan/runtime/radix_sort/target.h index 2164389757d..1ddac0ccc8e 100644 --- a/src/amd/vulkan/radix_sort/target.h +++ b/src/vulkan/runtime/radix_sort/target.h @@ -27,6 +27,7 @@ struct radix_sort_vk_target_config struct { uint32_t workgroup_size_log2; + uint32_t block_rows; } fill; struct @@ -48,6 +49,8 @@ struct radix_sort_vk_target_config uint32_t subgroup_size_log2; uint32_t block_rows; } scatter; + + bool nonsequential_dispatch; }; // diff --git a/src/vulkan/runtime/vk_acceleration_structure.c b/src/vulkan/runtime/vk_acceleration_structure.c index 074b94ea85c..ccea927f559 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.c +++ b/src/vulkan/runtime/vk_acceleration_structure.c @@ -27,7 +27,41 @@ #include "vk_alloc.h" #include "vk_common_entrypoints.h" #include "vk_device.h" +#include "vk_command_buffer.h" #include "vk_log.h" +#include "vk_meta.h" + +#include "bvh/vk_build_interface.h" +#include "bvh/vk_bvh.h" + +#include "radix_sort/common/vk/barrier.h" +#include "radix_sort/shaders/push.h" + +#include "util/u_string.h" + +static const uint32_t leaf_spv[] = { +#include "bvh/leaf.spv.h" +}; + +static const uint32_t leaf_always_active_spv[] = { +#include "bvh/leaf_always_active.spv.h" +}; + +static const uint32_t morton_spv[] = { +#include "bvh/morton.spv.h" +}; + +static const uint32_t lbvh_main_spv[] = { +#include "bvh/lbvh_main.spv.h" +}; + +static const uint32_t lbvh_generate_ir_spv[] = { +#include "bvh/lbvh_generate_ir.spv.h" +}; + +static const uint32_t ploc_spv[] = { +#include "bvh/ploc_internal.spv.h" +}; VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structure *accel_struct) @@ -92,3 +126,1122 @@ vk_common_GetAccelerationStructureDeviceAddressKHR( VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfo->accelerationStructure); return vk_acceleration_structure_get_va(accel_struct); } + +#define KEY_ID_PAIR_SIZE 8 +#define MORTON_BIT_SIZE 24 + +enum internal_build_type { + INTERNAL_BUILD_TYPE_LBVH, + INTERNAL_BUILD_TYPE_PLOC, + INTERNAL_BUILD_TYPE_UPDATE, +}; + +struct build_config { + enum internal_build_type internal_type; + bool updateable; + uint32_t encode_key[MAX_ENCODE_PASSES]; +}; + +struct scratch_layout { + uint32_t size; 
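For reference, the two radix_sort_vk_target_config additions above (fill.block_rows and the nonsequential_dispatch toggle) are filled in by the driver that instantiates the sort. A hedged, partial example: the numeric values are placeholders, and members not visible in this diff (e.g. the histogram block) are omitted:

/* Partial example only: values are invented and fields this diff does not
 * show are left zero-initialized -- consult target.h for the full struct.
 */
static const struct radix_sort_vk_target_config drv_rs_target_config = {
   .keyval_dwords = 2,            /* 8-byte key/id pairs */

   .fill = {
      .workgroup_size_log2 = 7,   /* 128 invocations */
      .block_rows = 8,            /* new field: block rows per fill workgroup */
   },

   .scatter = {
      .workgroup_size_log2 = 7,
      .subgroup_size_log2 = 6,
      .block_rows = 14,
   },

   /* New field: let scatter workgroups self-assign virtual workgroup ids
    * instead of relying on the launch order of gl_WorkGroupID.x.
    */
   .nonsequential_dispatch = true,
};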
+ uint32_t update_size; + + uint32_t header_offset; + + /* Used for BUILD only. */ + + uint32_t sort_buffer_offset[2]; + uint32_t sort_internal_offset; + + uint32_t ploc_prefix_sum_partition_offset; + uint32_t lbvh_node_offset; + + uint32_t ir_offset; + uint32_t internal_node_offset; +}; + +static struct build_config +build_config(uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_ops *ops) +{ + struct build_config config = {0}; + + if (leaf_count <= 4) + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && + !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) + config.internal_type = INTERNAL_BUILD_TYPE_PLOC; + else + config.internal_type = INTERNAL_BUILD_TYPE_LBVH; + + if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; + + if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && + build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + ops->update_as[0]) + config.updateable = true; + + for (unsigned i = 0; i < ARRAY_SIZE(config.encode_key); i++) { + if (!ops->get_encode_key[i]) + break; + config.encode_key[i] = ops->get_encode_key[i](leaf_count, build_info->flags); + } + + return config; +} + +static void +get_scratch_layout(struct vk_device *device, + uint32_t leaf_count, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const struct vk_acceleration_structure_build_args *args, + struct scratch_layout *scratch) +{ + uint32_t internal_count = MAX2(leaf_count, 2) - 1; + + radix_sort_vk_memory_requirements_t requirements = { + 0, + }; + radix_sort_vk_get_memory_requirements(args->radix_sort, leaf_count, + &requirements); + + uint32_t ir_leaf_size; + switch (vk_get_as_geometry_type(build_info)) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + ir_leaf_size = sizeof(struct vk_ir_triangle_node); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + ir_leaf_size = sizeof(struct vk_ir_aabb_node); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + ir_leaf_size = sizeof(struct vk_ir_instance_node); + break; + default: + unreachable("Unknown VkGeometryTypeKHR"); + } + + + uint32_t offset = 0; + + uint32_t ploc_scratch_space = 0; + uint32_t lbvh_node_space = 0; + + struct build_config config = build_config(leaf_count, build_info, + device->as_build_ops); + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) + ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); + else + lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; + + scratch->header_offset = offset; + offset += sizeof(struct vk_ir_header); + + scratch->sort_buffer_offset[0] = offset; + offset += requirements.keyvals_size; + + scratch->sort_buffer_offset[1] = offset; + offset += requirements.keyvals_size; + + scratch->sort_internal_offset = offset; + /* Internal sorting data is not needed when PLOC/LBVH are invoked, + * save space by aliasing them */ + scratch->ploc_prefix_sum_partition_offset = offset; + scratch->lbvh_node_offset = offset; + offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); + + 
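The build-type selection in build_config() above condenses to a small heuristic; restated as a standalone sketch for clarity. The helper name and boolean parameters are illustrative, and the UPDATE override for BLAS updates with a driver-provided update_as[0] hook sits on top of this exactly as in the function:

/* Illustrative restatement of build_config()'s internal-type heuristic. */
#include <stdbool.h>
#include <stdint.h>

enum sketch_build_type { SKETCH_LBVH, SKETCH_PLOC };

static enum sketch_build_type
choose_internal_type(uint32_t leaf_count, bool is_tlas,
                     bool prefer_fast_build, bool allow_update)
{
   /* Tiny builds always take the LBVH path. */
   if (leaf_count <= 4)
      return SKETCH_LBVH;

   /* Top levels always get the higher-quality PLOC hierarchy. */
   if (is_tlas)
      return SKETCH_PLOC;

   /* Bottom levels: PLOC unless the app asked for fast builds or updates,
    * where the cheaper LBVH build wins.
    */
   if (!prefer_fast_build && !allow_update)
      return SKETCH_PLOC;

   return SKETCH_LBVH;
}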
scratch->ir_offset = offset; + offset += ir_leaf_size * leaf_count; + + scratch->internal_node_offset = offset; + offset += sizeof(struct vk_ir_box_node) * internal_count; + + scratch->size = offset; + + if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR && + device->as_build_ops->update_as[0]) { + scratch->update_size = + device->as_build_ops->get_update_scratch_size(device, leaf_count); + } else { + scratch->update_size = offset; + } +} + +struct bvh_state { + uint32_t scratch_offset; + + uint32_t leaf_node_count; + uint32_t internal_node_count; + uint32_t leaf_node_size; + + struct scratch_layout scratch; + struct build_config config; + + /* Radix sort state */ + uint32_t scatter_blocks; + uint32_t count_ru_scatter; + uint32_t histo_blocks; + uint32_t count_ru_histo; + struct rs_push_scatter push_scatter; + + uint32_t last_encode_pass; +}; + +struct bvh_batch_state { + bool any_updateable; + bool any_non_updateable; + bool any_ploc; + bool any_lbvh; + bool any_update; +}; + +static VkResult +get_pipeline_spv(struct vk_device *device, struct vk_meta_device *meta, + const char *name, const uint32_t *spv, uint32_t spv_size, + unsigned push_constant_size, + const struct vk_acceleration_structure_build_args *args, + VkPipeline *pipeline, VkPipelineLayout *layout) +{ + size_t key_size = strlen(name); + + VkResult result = vk_meta_get_pipeline_layout( + device, meta, NULL, + &(VkPushConstantRange){ + VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size + }, + name, key_size, layout); + + if (result != VK_SUCCESS) + return result; + + VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(meta, name, key_size); + if (pipeline_from_cache != VK_NULL_HANDLE) { + *pipeline = pipeline_from_cache; + return VK_SUCCESS; + } + + VkShaderModuleCreateInfo module_info = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .codeSize = spv_size, + .pCode = spv, + }; + + VkSpecializationMapEntry spec_map[2] = { + { + .constantID = SUBGROUP_SIZE_ID, + .offset = 0, + .size = sizeof(args->subgroup_size), + }, + { + .constantID = BVH_BOUNDS_OFFSET_ID, + .offset = sizeof(args->subgroup_size), + .size = sizeof(args->bvh_bounds_offset), + }, + }; + + uint32_t spec_constants[2] = { + args->subgroup_size, + args->bvh_bounds_offset + }; + + VkSpecializationInfo spec_info = { + .mapEntryCount = ARRAY_SIZE(spec_map), + .pMapEntries = spec_map, + .dataSize = sizeof(spec_constants), + .pData = spec_constants, + }; + + VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT rssci = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = &module_info, + .requiredSubgroupSize = args->subgroup_size, + }; + + VkPipelineShaderStageCreateInfo shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &rssci, + .flags = VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + .pSpecializationInfo = &spec_info, + }; + + VkComputePipelineCreateInfo pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = shader_stage, + .flags = 0, + .layout = *layout, + }; + + return vk_meta_create_compute_pipeline(device, meta, &pipeline_info, + name, key_size, pipeline); +} + +static uint32_t +pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) +{ + uint32_t geometry_id_and_flags = geometry_id; + if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) + geometry_id_and_flags |= VK_GEOMETRY_OPAQUE; + + return 
geometry_id_and_flags; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) +{ + struct vk_bvh_geometry_data data = { + .first_id = first_id, + .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), + .geometry_type = geometry->geometryType, + }; + + switch (geometry->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.triangles.vertexData.deviceAddress + + build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; + data.indices = geometry->geometry.triangles.indexData.deviceAddress; + + if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) + data.data += build_range_info->primitiveOffset; + else + data.indices += build_range_info->primitiveOffset; + + data.transform = geometry->geometry.triangles.transformData.deviceAddress; + if (data.transform) + data.transform += build_range_info->transformOffset; + + data.stride = geometry->geometry.triangles.vertexStride; + data.vertex_format = geometry->geometry.triangles.vertexFormat; + data.index_format = geometry->geometry.triangles.indexType; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); + + data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; + data.stride = geometry->geometry.aabbs.stride; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); + + data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; + + if (geometry->geometry.instances.arrayOfPointers) + data.stride = 8; + else + data.stride = sizeof(VkAccelerationStructureInstanceKHR); + break; + default: + unreachable("Unknown geometryType"); + } + + return data; +} + +static void +vk_cmd_begin_debug_marker(VkCommandBuffer commandBuffer, const char *format, ...) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + va_list ap; + va_start(ap, format); + + char *name; + if (vasprintf(&name, format, ap) == -1) + return; + + va_end(ap); + + VkDebugMarkerMarkerInfoEXT marker = { + .sType = VK_STRUCTURE_TYPE_DEBUG_MARKER_MARKER_INFO_EXT, + .pMarkerName = name, + }; + + device->dispatch_table.CmdDebugMarkerBeginEXT(commandBuffer, &marker); +} + +static void +vk_cmd_end_debug_marker(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + struct vk_device *device = cmd_buffer->base.device; + + device->dispatch_table.CmdDebugMarkerEndEXT(commandBuffer); +} + +static VkResult +build_leaves(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + struct bvh_state *bvh_states, + bool updateable) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + /* Many apps are broken and will make inactive primitives active when + * updating, even though this is disallowed by the spec. 
To handle this, + * we use a different variant for updateable acceleration structures when + * the driver implements an update pass. This passes through inactive leaf + * nodes as if they were active, with an empty bounding box. It's then the + * driver or HW's responsibility to filter out inactive nodes. + */ + VkResult result; + if (updateable) { + result = get_pipeline_spv(device, meta, "leaves_always_active", + leaf_always_active_spv, + sizeof(leaf_always_active_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } else { + result = get_pipeline_spv(device, meta, "leaves", leaf_spv, sizeof(leaf_spv), + sizeof(struct leaf_args), args, &pipeline, &layout); + } + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "build_leaves"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + if (bvh_states[i].config.updateable != updateable) + continue; + + struct leaf_args leaf_consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { + const VkAccelerationStructureGeometryKHR *geom = + pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; + + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; + + leaf_consts.geom_data = vk_fill_geometry_data(pInfos[i].type, bvh_states[i].leaf_node_count, j, geom, build_range_info); + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); + device->cmd_dispatch_unaligned(commandBuffer, build_range_info->primitiveCount, 1, 1); + + bvh_states[i].leaf_node_count += build_range_info->primitiveCount; + } + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +morton_generate(VkCommandBuffer commandBuffer, struct vk_device *device, + struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "morton", morton_spv, sizeof(morton_spv), + sizeof(struct morton_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_generate"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + const struct morton_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], + }; + + disp->CmdPushConstants(commandBuffer, 
layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].leaf_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static void +morton_sort(VkCommandBuffer commandBuffer, struct vk_device *device, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "morton_sort"); + + /* Copyright 2019 The Fuchsia Authors. */ + const radix_sort_vk_t *rs = args->radix_sort; + + /* + * OVERVIEW + * + * 1. Pad the keyvals in `scatter_even`. + * 2. Zero the `histograms` and `partitions`. + * --- BARRIER --- + * 3. HISTOGRAM is dispatched before PREFIX. + * --- BARRIER --- + * 4. PREFIX is dispatched before the first SCATTER. + * --- BARRIER --- + * 5. One or more SCATTER dispatches. + * + * Note that the `partitions` buffer can be zeroed anytime before the first + * scatter. + */ + + /* How many passes? */ + uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); + uint32_t keyval_bits = keyval_bytes * 8; + uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); + uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].leaf_node_count) + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; + else + bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; + } + + /* + * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS + * + * Pad fractional blocks with max-valued keyvals. + * + * Zero the histograms and partitions buffer. + * + * This assumes the partitions follow the histograms. + */ + + /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ + + /* How many scatter blocks? */ + uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; + uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; + + /* + * How many histogram blocks? + * + * Note that it's OK to have more max-valued digits counted by the histogram + * than sorted by the scatters because the sort is stable. 
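To make the buffer bookkeeping above concrete: with MORTON_BIT_SIZE of 24 and (assuming) two keyval dwords for the 8-byte key/id pairs, the arithmetic works out as follows. Names and the keyval_dwords value are assumptions; the real values come from rs->config:

/* Worked example of the pass math used above and below. */
#include <assert.h>
#include <stdint.h>

static void
radix_pass_math_example(void)
{
   const uint32_t keyval_dwords = 2;                      /* assumed */
   const uint32_t keyval_bytes  = keyval_dwords * 4;      /* 8 */
   const uint32_t keyval_bits   = keyval_bytes * 8;       /* 64 */
   const uint32_t key_bits      = 24 < keyval_bits ? 24 : keyval_bits;
   const uint32_t passes        = (key_bits + 8 - 1) / 8; /* 3 8-bit passes */

   /* Scatters ping-pong even -> odd -> even ..., starting from the "even"
    * keyvals buffer, so the sorted result lands in
    * sort_buffer_offset[passes & 1].
    */
   assert(passes == 3 && (passes & 1) == 1);

   /* Only the high bytes that hold the Morton key get scattered; pass_idx
    * (computed just below) starts at the first byte that needs a pass.
    */
   assert(keyval_bytes - passes == 5);
}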
+ */ + uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; + uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; + + uint32_t pass_idx = (keyval_bytes - passes); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].scatter_blocks = (bvh_states[i].leaf_node_count + scatter_block_kvs - 1) / scatter_block_kvs; + bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; + + bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; + bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; + + /* Fill with max values */ + if (bvh_states[i].count_ru_histo > bvh_states[i].leaf_node_count) { + device->cmd_fill_buffer_addr(commandBuffer, keyvals_even_addr + + bvh_states[i].leaf_node_count * keyval_bytes, + (bvh_states[i].count_ru_histo - bvh_states[i].leaf_node_count) * keyval_bytes, + 0xFFFFFFFF); + } + + /* + * Zero histograms and invalidate partitions. + * + * Note that the partition invalidation only needs to be performed once + * because the even/odd scatter dispatches rely on the the previous pass to + * leave the partitions in an invalid state. + * + * Note that the last workgroup doesn't read/write a partition so it doesn't + * need to be initialized. + */ + uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; + + uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + device->cmd_fill_buffer_addr(commandBuffer, + internal_addr + rs->internal.histograms.offset + fill_base, + histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)) + keyval_bytes * sizeof(uint32_t), 0); + } + + /* + * Pipeline: HISTOGRAM + * + * TODO(allanmac): All subgroups should try to process approximately the same + * number of blocks in order to minimize tail effects. This was implemented + * and reverted but should be reimplemented and benchmarked later. + */ + vk_barrier_transfer_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.histogram); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + /* Dispatch histogram */ + struct rs_push_histogram push_histogram = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + .devaddr_keyvals = keyvals_even_addr, + .passes = passes, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_histogram), &push_histogram); + + disp->CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); + } + + /* + * Pipeline: PREFIX + * + * Launch one workgroup per pass. 
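The single cmd_fill_buffer_addr() above therefore clears three things at once: the histograms for the passes that will actually run, the lookback partitions, and (matching the memory map described in the scatter shader) the per-pass virtual workgroup-id counters that follow them. A C sketch of that extent; the helper name is hypothetical:

/* Offset/size of the zero fill, relative to rs->internal.histograms.offset.
 * RS_RADIX_SIZE is 256.
 */
#include <stdint.h>

#define RS_RADIX_SIZE 256u

static void
rs_fill_extent(uint32_t passes, uint32_t scatter_blocks, uint32_t keyval_bytes,
               uint64_t *offset_out, uint64_t *size_out)
{
   const uint64_t histo_size = RS_RADIX_SIZE * sizeof(uint32_t);

   /* Skip the histograms of keyval bytes that are never sorted. */
   *offset_out = (keyval_bytes - passes) * histo_size;

   /* `passes` histograms + `scatter_blocks - 1` partitions (the last
    * workgroup never touches a partition) + one uint32_t virtual
    * workgroup-id counter per keyval byte.
    */
   *size_out = (uint64_t)(passes + scatter_blocks - 1) * histo_size +
               keyval_bytes * sizeof(uint32_t);
}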
+ */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + rs->pipelines.named.prefix); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + struct rs_push_prefix push_prefix = { + .devaddr_histograms = internal_addr + rs->internal.histograms.offset, + }; + + disp->CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push_prefix), &push_prefix); + + disp->CmdDispatch(commandBuffer, passes, 1, 1); + } + + /* Pipeline: SCATTER */ + vk_barrier_compute_w_to_compute_r(commandBuffer); + + uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); + + for (uint32_t i = 0; i < infoCount; i++) { + uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; + uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; + uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; + + bvh_states[i].push_scatter = (struct rs_push_scatter){ + .devaddr_keyvals_even = keyvals_even_addr, + .devaddr_keyvals_odd = keyvals_odd_addr, + .devaddr_partitions = internal_addr + rs->internal.partitions.offset, + .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, + }; + } + + bool is_even = true; + + while (true) { + uint32_t pass_dword = pass_idx / 4; + + /* Bind new pipeline */ + VkPipeline p = + is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; + disp->CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); + + /* Update push constants that changed */ + VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even + : rs->pipeline_layouts.named.scatter[pass_dword].odd; + + for (uint32_t i = 0; i < infoCount; i++) { + if (!bvh_states[i].leaf_node_count) + continue; + if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) + continue; + + bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; + + disp->CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), + &bvh_states[i].push_scatter); + + disp->CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); + + bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); + } + + /* Continue? 
*/ + if (++pass_idx >= keyval_bytes) + break; + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + is_even ^= true; + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); +} + +static VkResult +lbvh_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + + VkResult result = + get_pipeline_spv(device, meta, "lbvh_main", lbvh_main_spv, + sizeof(lbvh_main_spv), + sizeof(struct lbvh_main_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "lbvh_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t internal_node_count = MAX2(bvh_states[i].leaf_node_count, 2) - 1; + + const struct lbvh_main_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .id_count = bvh_states[i].leaf_node_count, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, internal_node_count, 1, 1); + bvh_states[i].internal_node_count = internal_node_count; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + get_pipeline_spv(device, meta, "lbvh_generate_ir", lbvh_generate_ir_spv, + sizeof(lbvh_generate_ir_spv), + sizeof(struct lbvh_generate_ir_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) + continue; + + const struct lbvh_generate_ir_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + device->cmd_dispatch_unaligned(commandBuffer, bvh_states[i].internal_node_count, 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +static VkResult +ploc_build_internal(VkCommandBuffer commandBuffer, + struct vk_device *device, struct vk_meta_device *meta, + const struct vk_acceleration_structure_build_args *args, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) +{ + VkPipeline pipeline; + VkPipelineLayout layout; + 
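Both LBVH dispatches above launch one thread per internal node. The count they use, MAX2(leaf_node_count, 2) - 1, is the "binary tree over N leaves has N - 1 internal nodes" rule, clamped so degenerate 0- or 1-leaf builds still produce a root; a quick check:

#include <assert.h>
#include <stdint.h>

static uint32_t
lbvh_internal_node_count(uint32_t leaf_count)
{
   /* MAX2(leaf_count, 2) - 1 */
   return (leaf_count > 2 ? leaf_count : 2) - 1;
}

static void
lbvh_internal_node_count_examples(void)
{
   assert(lbvh_internal_node_count(0) == 1);   /* degenerate: just a root */
   assert(lbvh_internal_node_count(1) == 1);
   assert(lbvh_internal_node_count(2) == 1);
   assert(lbvh_internal_node_count(100) == 99);
}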
+ VkResult result = + get_pipeline_spv(device, meta, "ploc", ploc_spv, + sizeof(ploc_spv), + sizeof(struct ploc_args), args, &pipeline, &layout); + + if (result != VK_SUCCESS) + return result; + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "ploc_build_internal"); + + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + disp->CmdBindPipeline( + commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) + continue; + + uint32_t src_scratch_offset = bvh_states[i].scratch_offset; + uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) + ? bvh_states[i].scratch.sort_buffer_offset[1] + : bvh_states[i].scratch.sort_buffer_offset[0]; + + const struct ploc_args consts = { + .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, + .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, + .prefix_scan_partitions = + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, + .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, + }; + + disp->CmdPushConstants(commandBuffer, layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + disp->CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].leaf_node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + return VK_SUCCESS; +} + +void +vk_cmd_build_acceleration_structures(VkCommandBuffer commandBuffer, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, commandBuffer); + const struct vk_acceleration_structure_build_ops *ops = device->as_build_ops; + + struct bvh_batch_state batch_state = {0}; + + struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); + + if (args->emit_markers) + vk_cmd_begin_debug_marker(commandBuffer, "vkCmdBuildAccelerationStructuresKHR(%u)", infoCount); + + for (uint32_t i = 0; i < infoCount; ++i) { + uint32_t leaf_node_count = 0; + for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { + leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; + } + + get_scratch_layout(device, leaf_node_count, pInfos + i, args, &bvh_states[i].scratch); + + struct build_config config = build_config(leaf_node_count, pInfos + i, + device->as_build_ops); + bvh_states[i].config = config; + + if (config.updateable) + batch_state.any_updateable = true; + else + batch_state.any_non_updateable = true; + + if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { + batch_state.any_ploc = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { + batch_state.any_lbvh = true; + } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { + batch_state.any_update = true; + } else { + unreachable("Unknown internal_build_type"); + } + + if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { + /* The internal node count is updated in lbvh_build_internal for LBVH + * and from the PLOC 
shader for PLOC. */ + struct vk_ir_header header = { + .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, + .max_bounds = {0x80000000, 0x80000000, 0x80000000}, + .dispatch_size_y = 1, + .dispatch_size_z = 1, + .sync_data = + { + .current_phase_end_counter = TASK_INDEX_INVALID, + /* Will be updated by the first PLOC shader invocation */ + .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, + }, + }; + + device->write_buffer_cp(commandBuffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + &header, sizeof(header)); + } else { + VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); + VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); + + ops->init_update_scratch(commandBuffer, pInfos[i].scratchData.deviceAddress, + leaf_node_count, src_as, dst_as); + } + } + + /* Wait for the write_buffer_cp to land before using in compute shaders */ + device->flush_buffer_write_cp(commandBuffer); + device->dispatch_table.CmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, /* dependencyFlags */ + 1, + &(VkMemoryBarrier) { + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + }, 0, NULL, 0, NULL); + + if (batch_state.any_lbvh || batch_state.any_ploc) { + VkResult result; + + if (batch_state.any_non_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, false); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_updateable) { + result = + build_leaves(commandBuffer, device, meta, args, infoCount, pInfos, + ppBuildRangeInfos, bvh_states, true); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + result = + morton_generate(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + morton_sort(commandBuffer, device, args, infoCount, pInfos, bvh_states); + + vk_barrier_compute_w_to_compute_r(commandBuffer); + + if (batch_state.any_lbvh) { + result = + lbvh_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + free(bvh_states); + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + if (batch_state.any_ploc) { + result = + ploc_build_internal(commandBuffer, device, meta, args, infoCount, pInfos, bvh_states); + + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(cmd_buffer, result); + return; + } + } + + vk_barrier_compute_w_to_compute_r(commandBuffer); + vk_barrier_compute_w_to_indirect_compute_r(commandBuffer); + } + + for (unsigned pass = 0; pass < ARRAY_SIZE(ops->encode_as); pass++) { + if (!ops->encode_as[pass] && !ops->update_as[pass]) + break; + + bool progress; + do { + progress = false; + + bool update; + uint32_t encode_key; + for (uint32_t i = 0; i < infoCount; ++i) { + if (bvh_states[i].last_encode_pass == pass + 1) + continue; + + if (!progress) { + update = (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE); + if (update && !ops->update_as[pass]) + continue; + if (!update && !ops->encode_as[pass]) + continue; + encode_key = 
bvh_states[i].config.encode_key[pass]; + progress = true; + if (update) + ops->update_bind_pipeline[pass](commandBuffer); + else + ops->encode_bind_pipeline[pass](commandBuffer, encode_key); + } else { + if (update != (bvh_states[i].config.internal_type == + INTERNAL_BUILD_TYPE_UPDATE) || + encode_key != bvh_states[i].config.encode_key[pass]) + continue; + } + + VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); + + if (update) { + VK_FROM_HANDLE(vk_acceleration_structure, src, pInfos[i].srcAccelerationStructure); + ops->update_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + bvh_states[i].leaf_node_count, + src, + accel_struct); + + } else { + ops->encode_as[pass](commandBuffer, + &pInfos[i], + ppBuildRangeInfos[i], + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, + pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, + bvh_states[i].leaf_node_count, + encode_key, + accel_struct); + } + + bvh_states[i].last_encode_pass = pass + 1; + } + } while (progress); + } + + if (args->emit_markers) + vk_cmd_end_debug_marker(commandBuffer); + + free(bvh_states); +} + +void +vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args) +{ + VK_FROM_HANDLE(vk_device, device, _device); + + uint32_t leaf_count = 0; + for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) + leaf_count += pMaxPrimitiveCounts[i]; + + struct scratch_layout scratch; + + get_scratch_layout(device, leaf_count, pBuildInfo, args, &scratch); + + pSizeInfo->accelerationStructureSize = + device->as_build_ops->get_as_size(_device, pBuildInfo, leaf_count); + pSizeInfo->updateScratchSize = scratch.update_size; + pSizeInfo->buildScratchSize = scratch.size; +} + +/* Return true if the common framework supports using this format for loading + * vertices. Must match the formats handled by load_vertices() on the GPU. 
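A driver can forward its vkGetAccelerationStructureBuildSizesKHR entry point straight to vk_get_as_build_sizes() above. A hedged sketch: the "drv_" names, the device fields and where the radix_sort_vk_t instance lives are all assumptions, not part of this patch:

VKAPI_ATTR void VKAPI_CALL
drv_GetAccelerationStructureBuildSizesKHR(
   VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,
   const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,
   const uint32_t *pMaxPrimitiveCounts,
   VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo)
{
   VK_FROM_HANDLE(drv_device, device, _device);        /* assumed handle type */

   const struct vk_acceleration_structure_build_args args = {
      .subgroup_size = device->compute_subgroup_size,  /* assumed field */
      .bvh_bounds_offset = DRV_BVH_BOUNDS_OFFSET,      /* assumed constant */
      .emit_markers = false,
      .radix_sort = device->radix_sort,                /* assumed field */
   };

   vk_get_as_build_sizes(_device, buildType, pBuildInfo, pMaxPrimitiveCounts,
                         pSizeInfo, &args);
}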
+ */ +bool +vk_acceleration_struct_vtx_format_supported(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R16G16_SFLOAT: + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R16G16_SNORM: + case VK_FORMAT_R16G16_UNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + case VK_FORMAT_R8G8_SNORM: + case VK_FORMAT_R8G8_UNORM: + case VK_FORMAT_R8G8B8A8_SNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return true; + default: + return false; + } +} + diff --git a/src/vulkan/runtime/vk_acceleration_structure.h b/src/vulkan/runtime/vk_acceleration_structure.h index bcc2eff4660..b34d177cbfe 100644 --- a/src/vulkan/runtime/vk_acceleration_structure.h +++ b/src/vulkan/runtime/vk_acceleration_structure.h @@ -26,6 +26,11 @@ #define VK_ACCELERATION_STRUCTURE_H #include "vk_object.h" +#include "radix_sort/radix_sort_vk.h" + +#ifdef __cplusplus +extern "C" { +#endif struct vk_acceleration_structure { struct vk_object_base base; @@ -40,4 +45,88 @@ VkDeviceAddress vk_acceleration_structure_get_va(struct vk_acceleration_structur VK_DEFINE_NONDISP_HANDLE_CASTS(vk_acceleration_structure, base, VkAccelerationStructureKHR, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +#define MAX_ENCODE_PASSES 2 +#define MAX_UPDATE_PASSES 2 + +struct vk_acceleration_structure_build_ops { + VkDeviceSize (*get_as_size)(VkDevice device, + const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + uint32_t leaf_count); + VkDeviceSize (*get_update_scratch_size)(struct vk_device *device, uint32_t leaf_count); + uint32_t (*get_encode_key[MAX_ENCODE_PASSES])(VkAccelerationStructureTypeKHR type, + VkBuildAccelerationStructureFlagBitsKHR flags); + VkResult (*encode_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + uint32_t key); + void (*encode_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + VkDeviceAddress intermediate_as_addr, + VkDeviceAddress intermediate_header_addr, + uint32_t leaf_count, + uint32_t key, + struct vk_acceleration_structure *dst); + void (*init_update_scratch)(VkCommandBuffer cmd_buffer, + VkDeviceAddress scratch, + uint32_t leaf_count, + struct vk_acceleration_structure *src_as, + struct vk_acceleration_structure *dst_as); + void (*update_bind_pipeline[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer); + void (*update_as[MAX_ENCODE_PASSES])(VkCommandBuffer cmd_buffer, + const VkAccelerationStructureBuildGeometryInfoKHR *build_info, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_infos, + uint32_t leaf_count, + struct vk_acceleration_structure *dst, + struct vk_acceleration_structure *src); +}; + +struct vk_acceleration_structure_build_args { + uint32_t subgroup_size; + uint32_t bvh_bounds_offset; + bool emit_markers; + const radix_sort_vk_t *radix_sort; +}; + +struct vk_meta_device; + +void vk_cmd_build_acceleration_structures(VkCommandBuffer cmdbuf, + struct vk_device *device, + struct vk_meta_device *meta, + uint32_t info_count, + const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, + const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, + const struct vk_acceleration_structure_build_args *args); + +void vk_get_as_build_sizes(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, + const 
VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, + const uint32_t *pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo, + const struct vk_acceleration_structure_build_args *args); + +bool vk_acceleration_struct_vtx_format_supported(VkFormat format); + +static inline VkGeometryTypeKHR +vk_get_as_geometry_type(const VkAccelerationStructureBuildGeometryInfoKHR *build_info) +{ + if (build_info->geometryCount) { + if (build_info->pGeometries) + return build_info->pGeometries[0].geometryType; + else + return build_info->ppGeometries[0]->geometryType; + } + + /* If there are no geometries, the geometry type shouldn't matter, but + * return something. + */ + return VK_GEOMETRY_TYPE_TRIANGLES_KHR; +} + +struct vk_bvh_geometry_data +vk_fill_geometry_data(VkAccelerationStructureTypeKHR type, uint32_t first_id, uint32_t geom_index, + const VkAccelerationStructureGeometryKHR *geometry, + const VkAccelerationStructureBuildRangeInfoKHR *build_range_info); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/vulkan/runtime/vk_device.h b/src/vulkan/runtime/vk_device.h index 4d7220f832f..83d41afab0c 100644 --- a/src/vulkan/runtime/vk_device.h +++ b/src/vulkan/runtime/vk_device.h @@ -37,6 +37,7 @@ extern "C" { #endif +struct vk_acceleration_structure_build_ops; struct vk_command_buffer_ops; struct vk_device_shader_ops; struct vk_sync; @@ -134,6 +135,9 @@ struct vk_device { /** Shader vtable for VK_EXT_shader_object and common pipelines */ const struct vk_device_shader_ops *shader_ops; + /** Acceleration structure build vtable for common BVH building. */ + const struct vk_acceleration_structure_build_ops *as_build_ops; + /** * Write data to a buffer from the command processor. This is simpler than * setting up a staging buffer and faster for small writes, but is not
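Putting the pieces of this patch together, a driver opts into the common builder by filling vk_device::as_build_ops (added just above) with its encode callbacks and forwarding the build command to vk_cmd_build_acceleration_structures(). A hedged sketch: every "drv_" symbol, the meta-device member and the args values are assumptions, and the vk_device hooks the common code calls (cmd_dispatch_unaligned, write_buffer_cp, flush_buffer_write_cp, cmd_fill_buffer_addr) must also be provided by the driver:

/* Driver callbacks -- signatures per vk_acceleration_structure_build_ops. */
static const struct vk_acceleration_structure_build_ops drv_as_build_ops = {
   .get_as_size = drv_get_as_size,
   .get_update_scratch_size = drv_get_update_scratch_size,
   .get_encode_key = { drv_get_encode_key },
   .encode_bind_pipeline = { drv_encode_bind_pipeline },
   .encode_as = { drv_encode_as },
};

VKAPI_ATTR void VKAPI_CALL
drv_CmdBuildAccelerationStructuresKHR(
   VkCommandBuffer commandBuffer, uint32_t infoCount,
   const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
   const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)
{
   VK_FROM_HANDLE(drv_cmd_buffer, cmd, commandBuffer);
   struct drv_device *device = drv_cmd_buffer_device(cmd); /* assumed helper */

   /* device->vk.as_build_ops = &drv_as_build_ops; is set at device creation. */

   const struct vk_acceleration_structure_build_args args = {
      .subgroup_size = device->compute_subgroup_size,
      .bvh_bounds_offset = DRV_BVH_BOUNDS_OFFSET,
      .emit_markers = device->trace_markers,
      .radix_sort = device->radix_sort,
   };

   vk_cmd_build_acceleration_structures(commandBuffer, &device->vk,
                                        &device->meta, infoCount, pInfos,
                                        ppBuildRangeInfos, &args);
}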