diff --git a/src/amd/compiler/aco_insert_fp_mode.cpp b/src/amd/compiler/aco_insert_fp_mode.cpp
index 1f116564294..e74f2334b29 100644
--- a/src/amd/compiler/aco_insert_fp_mode.cpp
+++ b/src/amd/compiler/aco_insert_fp_mode.cpp
@@ -279,6 +279,13 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block)
          instr->opcode = aco_opcode::v_cvt_f16_f32;
       else
          instr->opcode = aco_opcode::s_cvt_f16_f32;
+   } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ||
+              instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtni) {
+      set_mode |= fp_state.require(mode_round16_64, instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ? fp_round_pi : fp_round_ni);
+      set_mode |= fp_state.require(mode_fp16_ovfl, default_state.fields[mode_fp16_ovfl]);
+      set_mode |= fp_state.require(mode_denorm16_64, default_state.fields[mode_denorm16_64]);
+      set_mode |= fp_state.require(mode_denorm32, default_state.fields[mode_denorm32]);
+      instr->opcode = aco_opcode::v_cvt_f16_f32;
    } else if (instr->opcode == aco_opcode::p_v_cvt_pk_fp8_f32_ovfl) {
       set_mode |= fp_state.require(mode_fp16_ovfl, 1);
       instr->opcode = aco_opcode::v_cvt_pk_fp8_f32;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index c7611d6df47..aa364f1a9de 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -718,6 +718,8 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
    /* VOP1 */
    case aco_opcode::v_cvt_f16_f32:
    case aco_opcode::p_v_cvt_f16_f32_rtne:
+   case aco_opcode::p_v_cvt_f16_f32_rtpi:
+   case aco_opcode::p_v_cvt_f16_f32_rtni:
    case aco_opcode::v_cvt_f16_u16:
    case aco_opcode::v_cvt_f16_i16:
    case aco_opcode::v_rcp_f16:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index dfb457c3eaf..5ca1abe6a01 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -1029,6 +1029,8 @@ VOP1 = {
    ("v_cvt_i32_f32", dst(U32), src(F32), op(0x08)),
    ("v_cvt_f16_f32", dst(F16), src(F32), op(0x0a)),
    ("p_v_cvt_f16_f32_rtne", dst(F16), src(F32), op(-1)),
+   ("p_v_cvt_f16_f32_rtpi", dst(F16), src(F32), op(-1)),
+   ("p_v_cvt_f16_f32_rtni", dst(F16), src(F32), op(-1)),
    ("v_cvt_f32_f16", dst(F32), src(F16), op(0x0b)),
    ("v_cvt_rpi_i32_f32", dst(U32), src(F32), op(0x0c)), #v_cvt_nearest_i32_f32 in GFX11
    ("v_cvt_flr_i32_f32", dst(U32), src(F32), op(0x0d)),#v_cvt_floor_i32_f32 in GFX11
diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
index 4d559b15833..2eea24b6500 100644
--- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
@@ -453,7 +453,9 @@ init_context(isel_context* ctx, nir_shader* shader)
             case nir_op_sdot_2x16_iadd_sat:
             case nir_op_bfdot2_bfadd:
             case nir_op_byte_perm_amd:
-            case nir_op_alignbyte_amd: type = RegType::vgpr; break;
+            case nir_op_alignbyte_amd:
+            case nir_op_f2f16_ru:
+            case nir_op_f2f16_rd: type = RegType::vgpr; break;
             case nir_op_fmul:
             case nir_op_ffma:
             case nir_op_fadd:
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
index 43bc6f16dd6..7beab6b1fe5 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
@@ -2615,6 +2615,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       }
       break;
    }
+   case nir_op_f2f16_ru:
+   case nir_op_f2f16_rd:
+      ctx->program->needs_fp_mode_insertion = true;
+      bld.vop1(instr->op == nir_op_f2f16_ru ? aco_opcode::p_v_cvt_f16_f32_rtpi
+                                            : aco_opcode::p_v_cvt_f16_f32_rtni,
+               Definition(dst), Operand(get_alu_src(ctx, instr->src[0])));
+      break;
    case nir_op_f2f32: {
       if (dst.regClass() == s1) {
         assert(instr->src[0].src.ssa->bit_size == 16);
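
For reference (illustration, not part of the patch): once the opcodes exist, NIR's generated
builders expose them directly, so a pass can bracket a 32-bit value with the two directed
conversions. A minimal sketch, assuming a valid nir_builder cursor:

   #include "nir.h"
   #include "nir_builder.h"

   /* Compute a conservative f16 interval [lo, hi] that contains the f32 value x.
    * The builder names follow from the opcode names added in nir_opcodes.py. */
   static void
   emit_f16_interval(nir_builder *b, nir_def *x, nir_def **lo, nir_def **hi)
   {
      *lo = nir_f2f16_rd(b, x); /* round toward -infinity */
      *hi = nir_f2f16_ru(b, x); /* round toward +infinity */
   }
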
diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h
index a63a534d349..895e1606d6d 100644
--- a/src/amd/vulkan/bvh/build_helpers.h
+++ b/src/amd/vulkan/bvh/build_helpers.h
@@ -8,6 +8,7 @@
 #define BVH_BUILD_HELPERS_H
 
 #include "bvh.h"
+#include "spirv_internal_exts.h"
 #include "vk_build_helpers.h"
 
 TYPE(radv_accel_struct_serialization_header, 8);
@@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type)
    return ptr_flags;
 }
 
+spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f);
+spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f);
+
 #endif /* BUILD_HELPERS_H */
diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h
index 15a7a2aaf5e..d3b726d296b 100644
--- a/src/amd/vulkan/bvh/build_interface.h
+++ b/src/amd/vulkan/bvh/build_interface.h
@@ -26,6 +26,7 @@
 #define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5))
 #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
 #define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7))
+#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8))
 
 #define RADV_COPY_MODE_COPY 0
 #define RADV_COPY_MODE_SERIALIZE 1
diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h
index f6e867df6bb..b86cefbe1ba 100644
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@@ -34,6 +34,7 @@
 #else
 #include <stdint.h>
 typedef uint16_t float16_t;
+typedef struct radv_aabb16 radv_aabb16;
 #endif
 
 struct radv_accel_struct_serialization_header {
@@ -112,9 +113,18 @@ struct radv_bvh_instance_node {
    mat3x4 otw_matrix;
 };
 
+struct radv_aabb16 {
+   float16_t min_x;
+   float16_t min_y;
+   float16_t min_z;
+   float16_t max_x;
+   float16_t max_y;
+   float16_t max_z;
+};
+
 struct radv_bvh_box16_node {
    uint32_t children[4];
-   float16_t coords[4][2][3];
+   radv_aabb16 coords[4];
 };
 
 struct radv_bvh_box32_node {
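
A quick sanity check on the new layout (a sketch, not part of the patch): radv_aabb16 packs six
float16_t values into 12 bytes, so radv_bvh_box16_node keeps the same 64-byte footprint as the
old float16_t coords[4][2][3] array:

   #include <assert.h>
   #include <stdint.h>

   typedef uint16_t float16_t;

   struct radv_aabb16 {
      float16_t min_x, min_y, min_z;
      float16_t max_x, max_y, max_z;
   };

   struct radv_bvh_box16_node {
      uint32_t children[4];
      struct radv_aabb16 coords[4];
   };

   /* 16 bytes of child ids + 4 * 12 bytes of bounds = 64 bytes. */
   static_assert(sizeof(struct radv_aabb16) == 12, "aabb16 must stay tightly packed");
   static_assert(sizeof(struct radv_bvh_box16_node) == 64, "box16 node layout changed");
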
diff --git a/src/amd/vulkan/bvh/encode.comp b/src/amd/vulkan/bvh/encode.comp
index 53c6f853d2c..1fb4dc5d728 100644
--- a/src/amd/vulkan/bvh/encode.comp
+++ b/src/amd/vulkan/bvh/encode.comp
@@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent)
    DEREF(REF(uint32_t)(addr)) = parent;
 }
 
+radv_aabb16
+radv_aabb_f32_to_f16(vk_aabb aabb)
+{
+   radv_aabb16 aabb16;
+   aabb16.min_x = radv_f32_to_f16_neg_inf(aabb.min.x);
+   aabb16.min_y = radv_f32_to_f16_neg_inf(aabb.min.y);
+   aabb16.min_z = radv_f32_to_f16_neg_inf(aabb.min.z);
+   aabb16.max_x = radv_f32_to_f16_pos_inf(aabb.max.x);
+   aabb16.max_y = radv_f32_to_f16_pos_inf(aabb.max.y);
+   aabb16.max_z = radv_f32_to_f16_pos_inf(aabb.max.z);
+   return aabb16;
+}
+
+vk_aabb
+radv_aabb_f16_to_f32(radv_aabb16 aabb16)
+{
+   vk_aabb aabb;
+   aabb.min.x = float(aabb16.min_x);
+   aabb.min.y = float(aabb16.min_y);
+   aabb.min.z = float(aabb16.min_z);
+   aabb.max.x = float(aabb16.max_x);
+   aabb.max.y = float(aabb16.max_y);
+   aabb.max.z = float(aabb16.max_z);
+   return aabb;
+}
+
 void
 main()
 {
@@ -89,18 +115,15 @@
       memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                     gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
-      if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
+      uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset;
+      if (node_id == VK_UNKNOWN_BVH_OFFSET)
          continue;
-      if (bvh_offset == VK_NULL_BVH_OFFSET)
+      if (node_id == VK_NULL_BVH_OFFSET)
          break;
 
       uint32_t flags = 0;
 
-      REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));
-      uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
-
       uint32_t found_child_count = 0;
       uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE,
                               RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE};
@@ -158,20 +181,33 @@
          break;
       }
 
+      REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
+      REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
+      bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16;
+
       for (uint32_t i = 0; i < found_child_count; ++i) {
          uint32_t type = ir_id_to_type(children[i]);
         uint32_t offset = ir_id_to_offset(children[i]);
-         uint32_t dst_offset;
+         uint32_t child_node_id;
+
+         vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
 
          if (type == vk_ir_node_internal) {
-            dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
+            radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb);
+            float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16));
+            float surface_area_f32 = aabb_surface_area(child_aabb);
+            bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5;
 
-            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
-            DEREF(child_node).bvh_offset = dst_offset;
+            uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset,
+                                            child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node));
+            child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32);
+
+            REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
+            DEREF(child_node).bvh_offset = child_node_id;
             flags |= (DEREF(child_node).flags & 0x3) << i * 8;
          } else {
            uint32_t child_index = offset / ir_leaf_node_size;
-            dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
+            uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
 
            if (type == vk_ir_node_instance) {
               vk_ir_instance_node src_node =
@@ -182,47 +218,65 @@
               uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]);
               flags |= (child_flags & 0x3) << i * 8;
            }
+
+            child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
         }
 
-         vk_aabb child_aabb =
-            DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
-
-         /* On gfx11, infinities in AABB coords can cause garbage child nodes to be
-          * returned by box intersection tests with non-default box sorting modes.
-          * Subtract 1 from the integer representation of inf/-inf to turn it into
-          * the maximum/minimum representable floating-point value as a workaround.
-          */
-         if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
-            for (uint32_t i = 0; i < 3; ++i) {
-               if (isinf(child_aabb.min[i]))
-                  child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
-               if (isinf(child_aabb.max[i]))
-                  child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
+         if (is_box16) {
+            DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb);
+         } else {
+            /* On gfx11, infinities in AABB coords can cause garbage child nodes to be
+             * returned by box intersection tests with non-default box sorting modes.
+             * Subtract 1 from the integer representation of inf/-inf to turn it into
+             * the maximum/minimum representable floating-point value as a workaround.
+             */
+            if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
+               for (uint32_t i = 0; i < 3; ++i) {
+                  if (isinf(child_aabb.min[i]))
+                     child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
+                  if (isinf(child_aabb.max[i]))
+                     child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
+               }
            }
+
+            DEREF(dst_node_f32).coords[i] = child_aabb;
         }
 
-         DEREF(dst_node).coords[i] = child_aabb;
-
-         uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
-         children[i] = child_id;
-         set_parent(child_id, node_id);
+         children[i] = child_node_id;
+         set_parent(child_node_id, node_id);
      }
 
-      for (uint i = found_child_count; i < 4; ++i) {
+      if (is_box16) {
+         radv_aabb16 null_aabb;
+         null_aabb.min_x = NAN_F16;
+         null_aabb.min_y = NAN_F16;
+         null_aabb.min_z = NAN_F16;
+         null_aabb.max_x = NAN_F16;
+         null_aabb.max_y = NAN_F16;
+         null_aabb.max_z = NAN_F16;
+         for (uint i = found_child_count; i < 4; ++i)
+            DEREF(dst_node_f16).coords[i] = null_aabb;
+      } else {
+         for (uint i = found_child_count; i < 4; ++i) {
            for (uint comp = 0; comp < 3; ++comp) {
-              DEREF(dst_node).coords[i].min[comp] = NAN;
-              DEREF(dst_node).coords[i].max[comp] = NAN;
+              DEREF(dst_node_f32).coords[i].min[comp] = NAN;
+              DEREF(dst_node_f32).coords[i].max[comp] = NAN;
            }
+         }
      }
 
       /* Make changes to the children's BVH offset value available to the other invocations.
        */
       memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                     gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      DEREF(dst_node).children = children;
+      if (is_box16) {
+         DEREF(dst_node_f16).children = children;
+      } else {
+         DEREF(dst_node_f32).children = children;
 
-      if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
-         DEREF(dst_node).flags = flags;
+         if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
+            DEREF(dst_node_f32).flags = flags;
+      }
 
       break;
    }
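
For intuition on the surface_area_f16 < surface_area_f32 * 1.5 test above, a standalone worked
example (numbers mine, not from the patch): rounding min down and max up in f16 can badly inflate
a small box far from the origin, in which case the encoder keeps full-precision box32 bounds for
that child.

   #include <stdio.h>

   /* Surface area of a box with the given extents. */
   static float
   surface_area(float x, float y, float z)
   {
      return 2.0f * (x * y + x * z + y * z);
   }

   int
   main(void)
   {
      /* A box spanning [1000.2, 1000.3] on each axis: f16 has a ulp of 0.5 in
       * [512, 1024), so directed rounding widens each axis to [1000.0, 1000.5]. */
      float sa_f32 = surface_area(0.1f, 0.1f, 0.1f);
      float sa_f16 = surface_area(0.5f, 0.5f, 0.5f);
      printf("growth: %.0fx\n", sa_f16 / sa_f32); /* 25x, far above 1.5x: keep box32 */
      return 0;
   }
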
diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build
index 3320ef67428..c0328db82c7 100644
--- a/src/amd/vulkan/bvh/meson.build
+++ b/src/amd/vulkan/bvh/meson.build
@@ -56,7 +56,7 @@ bvh_includes = files(
 bvh_spv = []
 foreach s : bvh_shaders
   command = [
-    prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
+    prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5',
     '-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
   ]
   command += vk_glsl_shader_preamble
diff --git a/src/amd/vulkan/layers/radv_rra_layer.c b/src/amd/vulkan/layers/radv_rra_layer.c
index b95a1331d45..df0573cd3e0 100644
--- a/src/amd/vulkan/layers/radv_rra_layer.c
+++ b/src/amd/vulkan/layers/radv_rra_layer.c
@@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p
    struct radv_device *device = radv_queue_device(queue);
    VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
 
-   if (result != VK_SUCCESS || !device->rra_trace.triggered)
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (radv_bvh_stats_file()) {
+      result = radv_dump_bvh_stats(_queue);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   if (!device->rra_trace.triggered)
       return result;
 
    uint32_t total_trace_count = 0;
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 27c696b8a55..60c0d743f54 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -75,6 +75,7 @@ enum radv_encode_key_bits {
    RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0),
    RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1),
    RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2),
+   RADV_ENCODE_KEY_USE_BOX16 = (1 << 3),
 };
 
 static void
@@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
    VK_FROM_HANDLE(radv_device, device, _device);
    struct radv_physical_device *pdev = radv_device_physical(device);
 
+   VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
+
    uint32_t encode_key = 0;
    if (radv_use_bvh8(pdev)) {
      /*
@@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
          state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR)
         encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS;
 
-      VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
       if (!(state->build_info->flags &
             (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
           geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
         encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
+   } else if (!radv_emulate_rt(pdev)) {
+      if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
+         encode_key |= RADV_ENCODE_KEY_USE_BOX16;
    }
 
    state->config.encode_key[0] = encode_key;
@@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
       flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
    if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
       flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;
+   if (key & RADV_ENCODE_KEY_USE_BOX16)
+      flags |= RADV_BUILD_FLAG_USE_BOX16;
 
    return flags;
 }
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 170f47506a1..41c97cdaa08 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -722,7 +722,7 @@ radv_device_init_tools(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) {
      result = radv_rra_trace_init(device);
      if (result != VK_SUCCESS)
         return result;
@@ -798,7 +798,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
    if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
       add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
 
-   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
+   if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev))
      add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
 
 #ifndef _WIN32
diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h
index 84a4d88cb32..4fe5b723621 100644
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@@ -115,4 +115,17 @@ const char *radv_get_perftest_option_name(int id);
 
 bool radv_is_rt_wave64_enabled(const struct radv_instance *instance);
 
+static const char *
+radv_bvh_stats_file(void)
+{
+   return os_get_option("RADV_BVH_STATS_FILE");
+}
+
+static bool
+radv_bvh_dumping_enabled(const struct radv_instance *instance)
+{
+   /* Gathering BVH stats reuses a large part of the RRA code for dumping BVHs. */
+   return (instance->vk.trace_mode & RADV_TRACE_MODE_RRA) || radv_bvh_stats_file();
+}
+
 #endif /* RADV_INSTANCE_H */
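
Usage note (illustrative, not from the patch): with the gating above, BVH statistics can be
collected without a full RRA capture; pointing the environment variable at a file is enough,
e.g. RADV_BVH_STATS_FILE=/tmp/bvh_stats.csv ./my_vulkan_app, where my_vulkan_app stands in for
any Vulkan application. Rows are appended to the CSV as queue submissions complete.
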
diff --git a/src/amd/vulkan/radv_rra.c b/src/amd/vulkan/radv_rra.c
index bba3e87943b..e31a5ba858c 100644
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@@ -198,7 +198,8 @@ rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, str
       /* TODO: calculate active primitives */
       .active_primitive_count = primitive_count,
       .geometry_description_count = header->geometry_count,
-      .interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node),
+      .interior_fp32_node_count = bvh_info->box32_count,
+      .interior_fp16_node_count = bvh_info->box16_count,
       .leaf_node_count = primitive_count,
       .rt_driver_interface_version = 8 << 16,
       .rt_ip_version = pdev->info.rt_ip_version,
@@ -488,6 +489,10 @@ radv_rra_trace_init(struct radv_device *device)
 
    device->rra_trace.ray_history = UTIL_DYNARRAY_INIT;
 
+   /* BVH stats dumping does not need ray history. */
+   if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA))
+      return VK_SUCCESS;
+
    device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
    if (device->rra_trace.ray_history_buffer_size <
        sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
@@ -624,6 +629,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
    simple_mtx_destroy(&data->data_mtx);
    _mesa_hash_table_destroy(data->accel_structs, NULL);
    _mesa_hash_table_u64_destroy(data->accel_struct_vas);
+
+   if (data->stats_file)
+      fclose(data->stats_file);
 }
 
 void
@@ -789,7 +797,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
    if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
       return NULL;
 
-   if (data->buffer->memory) {
+   if (data->buffer && data->buffer->memory) {
      VkMemoryMapInfo memory_map_info = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
         .memory = data->buffer->memory,
@@ -1297,3 +1305,167 @@ cleanup:
    free(accel_struct_offsets);
    return result;
 }
+
+static void
+dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct,
+               struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah,
+               bool tlas_pass)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+   const struct radv_instance *instance = radv_physical_device_instance(pdev);
+
+   struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;
+
+   bool is_tlas = header->instance_count > 0;
+   if (is_tlas != tlas_pass)
+      return;
+
+   /* Convert the root node id to an offset. */
+   uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;
+
+   if (rra_validate_header(accel_struct_data, header)) {
+      return;
+   }
+   if (radv_use_bvh8(pdev)) {
+      if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                  data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                  accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   } else {
+      if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
+                                    data + header->bvh_offset + src_root_offset, header->geometry_count,
+                                    accel_struct_data->size, !is_tlas, 0)) {
+         return;
+      }
+   }
+
+   if (!device->rra_trace.stats_file) {
+      device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w");
+      fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size");
+      if (radv_use_bvh8(pdev)) {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count");
+      } else {
+         fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count,"
+                                               "instance_node_count,procedural_node_count");
+      }
+      fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n");
+   }
+
+   fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name,
"tlas" : "blas", accel_struct_data->size, + header->compacted_size); + + float extent[3] = { + header->aabb.max.x - header->aabb.min.x, + header->aabb.max.y - header->aabb.min.y, + header->aabb.max.z - header->aabb.min.z, + }; + float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]); + + float sah; + float instance_sah; + if (radv_use_bvh8(pdev)) { + struct radv_bvh_stats_gfx12 stats = {}; + radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count, + stats.primitive_node_count, stats.instance_node_count); + } else { + struct radv_bvh_stats_gfx10_3 stats = {}; + radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats); + sah = stats.sah; + instance_sah = stats.instance_sah; + fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count, + stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count, stats.procedual_node_count); + } + + fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000)); + + if (is_tlas) { + fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000)); + } else { + fprintf(device->rra_trace.stats_file, ",0\n"); + + float *sah_ptr = ralloc(blas_sah, float); + *sah_ptr = sah / surface_area; + _mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr); + } + + fflush(device->rra_trace.stats_file); +} + +VkResult +radv_dump_bvh_stats(VkQueue vk_queue) +{ + VK_FROM_HANDLE(radv_queue, queue, vk_queue); + struct radv_device *device = radv_queue_device(queue); + VkDevice vk_device = radv_device_to_handle(device); + + VkResult result = vk_common_DeviceWaitIdle(vk_device); + if (result != VK_SUCCESS) + return result; + + struct hash_entry **hash_entries = NULL; + struct hash_table_u64 *blas_sah = NULL; + + uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs); + + hash_entries = malloc(sizeof(*hash_entries) * struct_count); + if (!hash_entries) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + struct hash_entry *last_entry = NULL; + for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i) + hash_entries[i] = last_entry; + + qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp); + + struct rra_copy_context copy_ctx = { + .device = vk_device, + .queue = vk_queue, + .entries = hash_entries, + .family_index = queue->vk.queue_family_index, + .min_size = device->rra_trace.ray_history_buffer_size, + }; + + result = rra_copy_context_init(©_ctx); + if (result != VK_SUCCESS) + goto cleanup; + + blas_sah = _mesa_hash_table_u64_create(NULL); + + for (unsigned i = 0; i < struct_count; i++) { + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false); + + rra_unmap_accel_struct_data(©_ctx, i); + } + + for (unsigned i = 0; i < struct_count; i++) { + if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key))) + continue; + + void *mapped_data = rra_map_accel_struct_data(©_ctx, i); + if (!mapped_data) + continue; + + 
diff --git a/src/amd/vulkan/radv_rra.h b/src/amd/vulkan/radv_rra.h
index c2c86f15d9f..730e4c45683 100644
--- a/src/amd/vulkan/radv_rra.h
+++ b/src/amd/vulkan/radv_rra.h
@@ -107,6 +107,7 @@ struct radv_rra_trace_data {
    struct hash_table *accel_structs;
    struct hash_table_u64 *accel_struct_vas;
    simple_mtx_t data_mtx;
+   FILE *stats_file;
    bool validate_as;
    bool copy_after_build;
    bool triggered;
@@ -288,6 +289,8 @@ struct rra_bvh_info {
    uint32_t leaf_nodes_size;
    uint32_t internal_nodes_size;
    uint32_t instance_sideband_data_size;
+   uint32_t box32_count;
+   uint32_t box16_count;
    struct rra_geometry_info *geometry_infos;
 };
@@ -320,4 +323,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_
 void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
                               uint32_t dst_offset);
 
+struct radv_bvh_stats_gfx10_3 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box16_node_count;
+   uint32_t box32_node_count;
+   uint32_t triangle_node_count;
+   uint32_t instance_node_count;
+   uint32_t procedural_node_count;
+};
+
+struct radv_bvh_stats_gfx12 {
+   uint32_t max_depth;
+   float sah;
+   float instance_sah;
+   uint32_t box_node_count;
+   uint32_t primitive_node_count;
+   uint32_t instance_node_count;
+};
+
+void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                   struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats);
+
+void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                                 struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats);
+
+VkResult radv_dump_bvh_stats(VkQueue vk_queue);
+
 #endif /* RADV_RRA_H */
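
Reading guide for the two gather functions implemented next: each recursive call receives the
surface area of the node's own box (the parameter p above), and accumulates a classic
surface-area-heuristic cost with per-type weights c(n) (on gfx10.3: 1.0 for box16, 1.5 for box32,
2.0 for triangles and instances, 4.0 for procedural AABBs). The sah column written earlier is
therefore, in fixed point,

   \mathrm{sah} = \Big\lfloor 10^{6} \cdot \frac{\sum_{n} c(n)\, SA(n)}{SA(\mathrm{root})} \Big\rfloor
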
diff --git a/src/amd/vulkan/radv_rra_gfx10_3.c b/src/amd/vulkan/radv_rra_gfx10_3.c
index 8ff1f01aa9d..6d3bfa706df 100644
--- a/src/amd/vulkan/radv_rra_gfx10_3.c
+++ b/src/amd/vulkan/radv_rra_gfx10_3.c
@@ -177,9 +177,11 @@ rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rra_bvh
    switch (node_type) {
    case radv_bvh_node_box16:
       dst->internal_nodes_size += sizeof(struct rra_box16_node);
+      dst->box16_count++;
       break;
    case radv_bvh_node_box32:
       dst->internal_nodes_size += sizeof(struct rra_box32_node);
+      dst->box32_count++;
       break;
    case radv_bvh_node_instance:
       dst->leaf_nodes_size += sizeof(struct rra_instance_node);
@@ -283,15 +285,15 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_
       vk_aabb bounds = {
          .min =
             {
-               _mesa_half_to_float(src->coords[i][0][0]),
-               _mesa_half_to_float(src->coords[i][0][1]),
-               _mesa_half_to_float(src->coords[i][0][2]),
+               _mesa_half_to_float(src->coords[i].min_x),
+               _mesa_half_to_float(src->coords[i].min_y),
+               _mesa_half_to_float(src->coords[i].min_z),
            },
          .max =
            {
-               _mesa_half_to_float(src->coords[i][1][0]),
-               _mesa_half_to_float(src->coords[i][1][1]),
-               _mesa_half_to_float(src->coords[i][1][2]),
+               _mesa_half_to_float(src->coords[i].max_x),
+               _mesa_half_to_float(src->coords[i].max_y),
+               _mesa_half_to_float(src->coords[i].max_z),
            },
       };
 
@@ -355,3 +357,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_
 
    return dst_id;
 }
+
+void
+radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
+                              struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats)
+{
+   uint32_t node_type = node_id & 7;
+   const void *node = bvh + ((node_id & (~7u)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box16: {
+      stats->sah += 1.0 * p;
+      stats->box16_node_count++;
+
+      const struct radv_bvh_box16_node *box16 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box16->children[i] != 0xffffffff) {
+            float extent[3] = {
+               _mesa_half_to_float(box16->coords[i].max_x) - _mesa_half_to_float(box16->coords[i].min_x),
+               _mesa_half_to_float(box16->coords[i].max_y) - _mesa_half_to_float(box16->coords[i].min_y),
+               _mesa_half_to_float(box16->coords[i].max_z) - _mesa_half_to_float(box16->coords[i].min_z),
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_box32: {
+      stats->sah += 1.5 * p;
+      stats->box32_node_count++;
+
+      const struct radv_bvh_box32_node *box32 = node;
+      for (uint32_t i = 0; i < 4; i++) {
+         if (box32->children[i] != 0xffffffff) {
+            float extent[3] = {
+               box32->coords[i].max.x - box32->coords[i].min.x,
+               box32->coords[i].max.y - box32->coords[i].min.y,
+               box32->coords[i].max.z - box32->coords[i].min.z,
+            };
+            float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
+            radv_gather_bvh_stats_gfx10_3(bvh, box32->children[i], depth + 1, surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->sah += 2.0 * p;
+      stats->instance_node_count++;
+
+      const struct radv_bvh_instance_node *instance = node;
+      uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * p;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->sah += 2.0 * p;
+      stats->triangle_node_count++;
+      break;
+   case radv_bvh_node_aabb:
+      stats->sah += 4.0 * p;
+      stats->procedural_node_count++;
+      break;
+   default:
+      break;
+   }
+}
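
As context for the "& 7" / "<< 3" arithmetic in the traversal above: gfx10.3 node ids keep the
node type in the low three bits and the node offset in units of 8 bytes in the rest. A standalone
sketch of the corresponding pack/unpack (signatures mine, mirroring the shader-side helpers):

   #include <stdint.h>

   /* Node id layout: bits [2:0] = node type, bits [31:3] = byte offset >> 3.
    * Nodes are 64-byte aligned, so the low bits of offset >> 3 are free. */
   static inline uint32_t
   pack_node_id(uint32_t offset, uint32_t type)
   {
      return (offset >> 3) | type;
   }

   static inline uint32_t
   node_id_to_offset(uint32_t node_id)
   {
      return (node_id & ~7u) << 3;
   }
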
diff --git a/src/amd/vulkan/radv_rra_gfx12.c b/src/amd/vulkan/radv_rra_gfx12.c
index 3260664db52..ce233bb0d82 100644
--- a/src/amd/vulkan/radv_rra_gfx12.c
+++ b/src/amd/vulkan/radv_rra_gfx12.c
@@ -10,6 +10,7 @@
 #include "radv_rra.h"
 
 #include "util/bitset.h"
+#include "util/compiler.h"
 
 struct rra_instance_sideband_data {
    uint32_t instance_index;
@@ -306,3 +307,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id
       }
    }
 }
+
+void
+radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area,
+                            struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats)
+{
+   uint32_t node_type = node_id & 0xf;
+   const void *node = bvh + ((node_id & (~0xf)) << 3);
+
+   stats->max_depth = MAX2(stats->max_depth, depth);
+
+   switch (node_type) {
+   case radv_bvh_node_box32: {
+      stats->box_node_count++;
+      stats->sah += 0.5 * surface_area;
+
+      const struct radv_gfx12_box_node *src = node;
+
+      uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28;
+
+      if (valid_child_count_minus_one != 0xf) {
+         uint32_t internal_id = src->internal_base_id;
+         uint32_t primitive_id = src->primitive_base_id;
+
+         uint32_t exponents[3] = {
+            src->child_count_exponents & 0xff,
+            (src->child_count_exponents >> 8) & 0xff,
+            (src->child_count_exponents >> 16) & 0xff,
+         };
+         float extent[3] = {
+            uif(exponents[0] << 23),
+            uif(exponents[1] << 23),
+            uif(exponents[2] << 23),
+         };
+
+         for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
+            uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
+            uint32_t child_size = src->children[i].dword2 >> 28;
+
+            uint32_t child_id;
+            if (child_type == radv_bvh_node_box32) {
+               child_id = internal_id | child_type;
+               internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            } else {
+               child_id = primitive_id | child_type;
+               primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
+            }
+
+            float min[3] = {
+               (float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0],
+               (float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1],
+               (float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2],
+            };
+            float max[3] = {
+               (float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0],
+               (float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1],
+               (float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2],
+            };
+            float child_extent[3] = {
+               max[0] - min[0],
+               max[1] - min[1],
+               max[2] - min[2],
+            };
+            float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] +
+                                            child_extent[1] * child_extent[2]);
+
+            radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats);
+         }
+      }
+
+      break;
+   }
+   case radv_bvh_node_instance: {
+      stats->instance_node_count++;
+      stats->sah += 0.7 * surface_area;
+
+      struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node);
+      const struct radv_gfx12_instance_node_user_data *user_data =
+         (const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node));
+      uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset;
+      float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
+      if (sah)
+         stats->instance_sah += *sah * surface_area;
+      else
+         fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%" PRIx64 "\n", blas_va);
+
+      break;
+   }
+   case radv_bvh_node_triangle:
+      stats->primitive_node_count++;
+      FALLTHROUGH;
+   default:
+      stats->sah += 1.0 * surface_area;
+      break;
+   }
+}
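
The 12-bit child bounds decoded above work against a per-axis power-of-two extent: each exponent
byte is placed straight into a float's exponent field (uif(exp << 23)), and the quantized min/max
scale by extent/4096, with the max biased by one grid cell so the decoded box stays conservative.
A standalone sketch of one axis (function names mine):

   #include <stdint.h>
   #include <string.h>

   /* Reinterpret a bit pattern as a float, like Mesa's uif(). */
   static float
   uif(uint32_t bits)
   {
      float f;
      memcpy(&f, &bits, sizeof(f));
      return f;
   }

   /* Decode one axis of a gfx12 quantized child AABB, relative to the parent
    * origin: a 12-bit grid laid over a power-of-two extent. */
   static void
   decode_axis(uint32_t exponent_byte, uint32_t qmin, uint32_t qmax,
               float *out_min, float *out_max)
   {
      float extent = uif(exponent_byte << 23);        /* 2^(exponent_byte - 127) */
      *out_min = (float)qmin / 0x1000 * extent;
      *out_max = (float)(qmax + 1) / 0x1000 * extent; /* +1: upper edge of the cell */
   }
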
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 6c9aa7e5117..723d2691101 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -257,7 +257,7 @@ for src_t in [tint, tuint, tfloat, tbool]:
    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
-               rnd_modes = ['_rtne', '_rtz', '']
+               rnd_modes = ['_rtne', '_rtz', '_ru', '_rd', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '_rtne':
                        conv_expr = """
@@ -279,6 +279,22 @@ for src_t in [tint, tuint, tfloat, tbool]:
                           dst = src0;
                        }
                        """
+                   elif rnd_mode == '_ru':
+                       conv_expr = """
+                       if (bit_size > 16) {
+                          dst = _mesa_half_to_float(_mesa_float_to_float16_ru(src0));
+                       } else {
+                          dst = src0;
+                       }
+                       """
+                   elif rnd_mode == '_rd':
+                       conv_expr = """
+                       if (bit_size > 16) {
+                          dst = _mesa_half_to_float(_mesa_float_to_float16_rd(src0));
+                       } else {
+                          dst = src0;
+                       }
+                       """
                    else:
                        conv_expr = """
                        if (bit_size > 32) {
diff --git a/src/compiler/spirv/spirv_internal_exts.h b/src/compiler/spirv/spirv_internal_exts.h
new file mode 100644
index 00000000000..74c3b9fd79b
--- /dev/null
+++ b/src/compiler/spirv/spirv_internal_exts.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef SPIRV_INTERNAL_EXTS_H
+#define SPIRV_INTERNAL_EXTS_H
+
+#define SpvOpFConvertRUMesa 0
+#define SpvOpFConvertRDMesa 1
+
+#endif
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 7d4b9a7d21f..00e7d0d58f1 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -923,6 +923,29 @@ vtn_handle_non_semantic_debug_info(struct vtn_builder *b, SpvOp ext_opcode,
    return true;
 }
 
+static bool
+vtn_handle_mesa_internal(struct vtn_builder *b, SpvOp ext_opcode,
+                         const uint32_t *w, unsigned count)
+{
+   uint32_t instr = w[4];
+
+   switch (instr) {
+   case SpvOpFConvertRUMesa: {
+      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
+      vtn_push_nir_ssa(b, w[2], nir_f2f16_ru(&b->nb, arg->def));
+      break;
+   }
+   case SpvOpFConvertRDMesa: {
+      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
+      vtn_push_nir_ssa(b, w[2], nir_f2f16_rd(&b->nb, arg->def));
+      break;
+   }
+   }
+
+   return true;
+}
+
 static void
 vtn_handle_extension(struct vtn_builder *b, SpvOp opcode, const uint32_t *w,
                      unsigned count)
@@ -958,6 +981,8 @@ vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
          val->ext_handler = vtn_handle_debug_printf;
       } else if (strstr(ext, "NonSemantic.") == ext) {
          val->ext_handler = vtn_handle_non_semantic_instruction;
+      } else if (strstr(ext, "MesaInternal") == ext) {
+         val->ext_handler = vtn_handle_mesa_internal;
       } else {
          vtn_fail("Unsupported extension: %s", ext);
       }
diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index 5d601f95c86..9f6009ed8ea 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -33,6 +33,7 @@
 #include "spirv.h"
 #include "spirv_info.h"
 #include "vtn_generator_ids.h"
+#include "spirv_internal_exts.h"
 
 extern uint32_t mesa_spirv_debug;
diff --git a/src/util/half_float.c b/src/util/half_float.c
index 0eacf06c5a8..6734842df1e 100644
--- a/src/util/half_float.c
+++ b/src/util/half_float.c
@@ -211,3 +211,41 @@ uint16_t _mesa_uint16_div_64k_to_half(uint16_t v)
 
    return (e << 10) | m;
 }
+
+static uint16_t
+util_nextafter16(uint16_t x, bool up)
+{
+   uint16_t sign_mask = 1ull << 15;
+   uint16_t min_abs = 1;
+
+   float f = _mesa_half_to_float(x);
+   if (isnan(f) || (f == INFINITY && up) || (f == -INFINITY && !up))
+      return x;
+
+   /* Beware of +/-0.0: 0x0000 - 1 and 0x8000 - 1 both wrap to NaN encodings. */
+   uint16_t xn = f == 0 ? (sign_mask | min_abs) : x - 1;
+
+   /* Beware of -0.0: 0x8000 + 1 is -0x1p-24, the smallest negative subnormal. */
+   uint16_t xp = f == 0 ? min_abs : x + 1;
+
+   /* Away from zero, nextafter is just +/- 1 on the integer representation. */
+   return (up ^ (f < 0)) ? xp : xn;
+}
+
+uint16_t
+_mesa_float_to_float16_ru(float val)
+{
+   uint16_t half = _mesa_float_to_half(val);
+   if (_mesa_half_to_float(half) < val)
+      return util_nextafter16(half, true);
+   return half;
+}
+
+uint16_t
+_mesa_float_to_float16_rd(float val)
+{
+   uint16_t half = _mesa_float_to_half(val);
+   if (_mesa_half_to_float(half) > val)
+      return util_nextafter16(half, false);
+   return half;
+}
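
A quick self-check of the directed conversions (hypothetical test harness, not part of the
patch): for any finite input, converting back must give rd(f) <= f <= ru(f), with the two ends at
most one f16 ulp apart.

   #include <assert.h>
   #include <stdio.h>
   #include "util/half_float.h"

   int
   main(void)
   {
      const float inputs[] = {0.1f, 1000.25f, -3.3f, 65504.0f /* FLT16_MAX */};
      for (unsigned i = 0; i < 4; i++) {
         float f = inputs[i];
         float lo = _mesa_half_to_float(_mesa_float_to_float16_rd(f));
         float hi = _mesa_half_to_float(_mesa_float_to_float16_ru(f));
         assert(lo <= f && f <= hi);
         printf("%f is contained in [%f, %f]\n", f, lo, hi);
      }
      return 0;
   }
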
diff --git a/src/util/half_float.h b/src/util/half_float.h
index f184323bd60..6961e1ed618 100644
--- a/src/util/half_float.h
+++ b/src/util/half_float.h
@@ -113,6 +113,9 @@ _mesa_float_to_float16_rtz(float val)
    return _mesa_float_to_float16_rtz_slow(val);
 }
 
+uint16_t _mesa_float_to_float16_ru(float val);
+uint16_t _mesa_float_to_float16_rd(float val);
+
 static inline uint16_t
 _mesa_float_to_float16_rtne(float val)
 {
diff --git a/src/vulkan/runtime/bvh/meson.build b/src/vulkan/runtime/bvh/meson.build
index 02b2afb4163..add1590b70f 100644
--- a/src/vulkan/runtime/bvh/meson.build
+++ b/src/vulkan/runtime/bvh/meson.build
@@ -42,6 +42,7 @@ bvh_shaders = [
   ],
 ]
 
+spirv_include_dir = dir_source_root + '/src/compiler/spirv'
 vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'
 
 vk_bvh_includes = files(
@@ -50,6 +51,7 @@ vk_bvh_includes = files(
   'vk_build_interface.h',
   'vk_bvh.h',
   'vk_debug.h',
+  spirv_include_dir + '/spirv_internal_exts.h',
 )
 
 vk_glsl_shader_extensions = [
@@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [
   'GL_KHR_shader_subgroup_ballot',
   'GL_KHR_shader_subgroup_clustered',
   'GL_EXT_shader_atomic_int64',
+  'GL_EXT_spirv_intrinsics',
 ]
 
 vk_glsl_shader_preamble = []
@@ -79,7 +82,7 @@ endforeach
 bvh_spv = []
 foreach s : bvh_shaders
   command = [
-    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
+    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
   ] + (with_mesa_debug ? ['-g'] : [])
   command += glslang_quiet
   command += vk_glsl_shader_preamble
diff --git a/src/vulkan/runtime/bvh/vk_build_helpers.h b/src/vulkan/runtime/bvh/vk_build_helpers.h
index 01acb4db715..dd5795855b2 100644
--- a/src/vulkan/runtime/bvh/vk_build_helpers.h
+++ b/src/vulkan/runtime/bvh/vk_build_helpers.h
@@ -180,6 +180,7 @@
 
 #define INFINITY (1.0 / 0.0)
 #define NAN (0.0 / 0.0)
+#define NAN_F16 (0.0hf / 0.0hf)
 
 #define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))