Merge branch 'radv-box16' into 'main'

radv: Use box16 nodes when bvh8 is not used

See merge request mesa/mesa!37883
This commit is contained in:
Konstantin Seurer 2025-12-20 01:06:20 +01:00
commit ddef04ff5b
26 changed files with 647 additions and 53 deletions

View file

@ -279,6 +279,13 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block)
instr->opcode = aco_opcode::v_cvt_f16_f32;
else
instr->opcode = aco_opcode::s_cvt_f16_f32;
} else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ||
instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtni) {
set_mode |= fp_state.require(mode_round16_64, instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtpi ? fp_round_pi : fp_round_ni);
set_mode |= fp_state.require(mode_fp16_ovfl, default_state.fields[mode_fp16_ovfl]);
set_mode |= fp_state.require(mode_denorm16_64, default_state.fields[mode_denorm16_64]);
set_mode |= fp_state.require(mode_denorm32, default_state.fields[mode_denorm32]);
instr->opcode = aco_opcode::v_cvt_f16_f32;
} else if (instr->opcode == aco_opcode::p_v_cvt_pk_fp8_f32_ovfl) {
set_mode |= fp_state.require(mode_fp16_ovfl, 1);
instr->opcode = aco_opcode::v_cvt_pk_fp8_f32;

View file

@ -718,6 +718,8 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
/* VOP1 */
case aco_opcode::v_cvt_f16_f32:
case aco_opcode::p_v_cvt_f16_f32_rtne:
case aco_opcode::p_v_cvt_f16_f32_rtpi:
case aco_opcode::p_v_cvt_f16_f32_rtni:
case aco_opcode::v_cvt_f16_u16:
case aco_opcode::v_cvt_f16_i16:
case aco_opcode::v_rcp_f16:

View file

@ -1029,6 +1029,8 @@ VOP1 = {
("v_cvt_i32_f32", dst(U32), src(F32), op(0x08)),
("v_cvt_f16_f32", dst(F16), src(F32), op(0x0a)),
("p_v_cvt_f16_f32_rtne", dst(F16), src(F32), op(-1)),
("p_v_cvt_f16_f32_rtpi", dst(F16), src(F32), op(-1)),
("p_v_cvt_f16_f32_rtni", dst(F16), src(F32), op(-1)),
("v_cvt_f32_f16", dst(F32), src(F16), op(0x0b)),
("v_cvt_rpi_i32_f32", dst(U32), src(F32), op(0x0c)), #v_cvt_nearest_i32_f32 in GFX11
("v_cvt_flr_i32_f32", dst(U32), src(F32), op(0x0d)),#v_cvt_floor_i32_f32 in GFX11

View file

@ -453,7 +453,9 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_op_sdot_2x16_iadd_sat:
case nir_op_bfdot2_bfadd:
case nir_op_byte_perm_amd:
case nir_op_alignbyte_amd: type = RegType::vgpr; break;
case nir_op_alignbyte_amd:
case nir_op_f2f16_ru:
case nir_op_f2f16_rd: type = RegType::vgpr; break;
case nir_op_fmul:
case nir_op_ffma:
case nir_op_fadd:

View file

@ -2615,6 +2615,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
break;
}
case nir_op_f2f16_ru:
case nir_op_f2f16_rd:
ctx->program->needs_fp_mode_insertion = true;
bld.vop1(instr->op == nir_op_f2f16_ru ? aco_opcode::p_v_cvt_f16_f32_rtpi
: aco_opcode::p_v_cvt_f16_f32_rtni,
Definition(dst), Operand(get_alu_src(ctx, instr->src[0])));
break;
case nir_op_f2f32: {
if (dst.regClass() == s1) {
assert(instr->src[0].src.ssa->bit_size == 16);

View file

@ -8,6 +8,7 @@
#define BVH_BUILD_HELPERS_H
#include "bvh.h"
#include "spirv_internal_exts.h"
#include "vk_build_helpers.h"
TYPE(radv_accel_struct_serialization_header, 8);
@ -110,4 +111,7 @@ radv_encode_blas_pointer_flags(uint32_t flags, uint32_t geometry_type)
return ptr_flags;
}
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRUMesa) float16_t radv_f32_to_f16_pos_inf(float f);
spirv_instruction(set = "MesaInternal", id = SpvOpFConvertRDMesa) float16_t radv_f32_to_f16_neg_inf(float f);
#endif /* BVH_BUILD_HELPERS_H */

View file

@ -26,6 +26,7 @@
#define RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 5))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES (1u << (VK_BUILD_FLAG_COUNT + 6))
#define RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (1u << (VK_BUILD_FLAG_COUNT + 7))
#define RADV_BUILD_FLAG_USE_BOX16 (1u << (VK_BUILD_FLAG_COUNT + 8))
#define RADV_COPY_MODE_COPY 0
#define RADV_COPY_MODE_SERIALIZE 1

View file

@ -34,6 +34,7 @@
#else
#include <vulkan/vulkan.h>
typedef uint16_t float16_t;
typedef struct radv_aabb16 radv_aabb16;
#endif
struct radv_accel_struct_serialization_header {
@ -112,9 +113,18 @@ struct radv_bvh_instance_node {
mat3x4 otw_matrix;
};
struct radv_aabb16 {
float16_t min_x;
float16_t min_y;
float16_t min_z;
float16_t max_x;
float16_t max_y;
float16_t max_z;
};
struct radv_bvh_box16_node {
uint32_t children[4];
float16_t coords[4][2][3];
radv_aabb16 coords[4];
};
struct radv_bvh_box32_node {

View file

@ -22,6 +22,32 @@ void set_parent(uint32_t child, uint32_t parent)
DEREF(REF(uint32_t)(addr)) = parent;
}
radv_aabb16
radv_aabb_f32_to_f16(vk_aabb aabb)
{
   /* Round mins towards -inf and maxs towards +inf so the f16 box always
    * conservatively contains the original f32 box.
    */
   return radv_aabb16(radv_f32_to_f16_neg_inf(aabb.min.x), radv_f32_to_f16_neg_inf(aabb.min.y),
                      radv_f32_to_f16_neg_inf(aabb.min.z), radv_f32_to_f16_pos_inf(aabb.max.x),
                      radv_f32_to_f16_pos_inf(aabb.max.y), radv_f32_to_f16_pos_inf(aabb.max.z));
}
vk_aabb
radv_aabb_f16_to_f32(radv_aabb16 aabb16)
{
   /* Widen each fp16 coordinate back to fp32; no rounding is involved since
    * every fp16 value is exactly representable in fp32.
    */
   vk_aabb result;
   result.min = vec3(float(aabb16.min_x), float(aabb16.min_y), float(aabb16.min_z));
   result.max = vec3(float(aabb16.max_x), float(aabb16.max_y), float(aabb16.max_z));
   return result;
}
void
main()
{
@ -89,18 +115,15 @@ main()
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
uint32_t bvh_offset = is_root_node ? id_to_offset(RADV_BVH_ROOT_NODE) : DEREF(src_node).bvh_offset;
if (bvh_offset == VK_UNKNOWN_BVH_OFFSET)
uint32_t node_id = is_root_node ? RADV_BVH_ROOT_NODE : DEREF(src_node).bvh_offset;
if (node_id == VK_UNKNOWN_BVH_OFFSET)
continue;
if (bvh_offset == VK_NULL_BVH_OFFSET)
if (node_id == VK_NULL_BVH_OFFSET)
break;
uint32_t flags = 0;
REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, bvh_offset));
uint32_t node_id = pack_node_id(bvh_offset, radv_bvh_node_box32);
uint32_t found_child_count = 0;
uint32_t children[4] = {RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE,
RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE};
@ -158,20 +181,33 @@ main()
break;
}
REF(radv_bvh_box16_node) dst_node_f16 = REF(radv_bvh_box16_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
REF(radv_bvh_box32_node) dst_node_f32 = REF(radv_bvh_box32_node)(OFFSET(args.output_bvh, id_to_offset(node_id)));
bool is_box16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && id_to_type(node_id) == radv_bvh_node_box16;
for (uint32_t i = 0; i < found_child_count; ++i) {
uint32_t type = ir_id_to_type(children[i]);
uint32_t offset = ir_id_to_offset(children[i]);
uint32_t dst_offset;
uint32_t child_node_id;
vk_aabb child_aabb = DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
if (type == vk_ir_node_internal) {
dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, SIZEOF(radv_bvh_box32_node));
radv_aabb16 child_aabb16 = radv_aabb_f32_to_f16(child_aabb);
float surface_area_f16 = aabb_surface_area(radv_aabb_f16_to_f32(child_aabb16));
float surface_area_f32 = aabb_surface_area(child_aabb);
bool child_use_f16 = VK_BUILD_FLAG(RADV_BUILD_FLAG_USE_BOX16) && surface_area_f16 < surface_area_f32 * 1.5;
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node)OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = dst_offset;
uint32_t dst_offset = atomicAdd(DEREF(args.header).dst_node_offset,
child_use_f16 ? SIZEOF(radv_bvh_box16_node) : SIZEOF(radv_bvh_box32_node));
child_node_id = pack_node_id(dst_offset, child_use_f16 ? radv_bvh_node_box16 : radv_bvh_node_box32);
REF(vk_ir_box_node) child_node = REF(vk_ir_box_node) OFFSET(args.intermediate_bvh, offset);
DEREF(child_node).bvh_offset = child_node_id;
flags |= (DEREF(child_node).flags & 0x3) << i * 8;
} else {
uint32_t child_index = offset / ir_leaf_node_size;
dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
uint32_t dst_offset = dst_leaf_offset + child_index * output_leaf_node_size;
if (type == vk_ir_node_instance) {
vk_ir_instance_node src_node =
@ -182,47 +218,65 @@ main()
uint32_t child_flags = fetch_child_flags(args.intermediate_bvh, children[i]);
flags |= (child_flags & 0x3) << i * 8;
}
child_node_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
}
vk_aabb child_aabb =
DEREF(REF(vk_ir_node)OFFSET(args.intermediate_bvh, offset)).aabb;
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
if (is_box16) {
DEREF(dst_node_f16).coords[i] = radv_aabb_f32_to_f16(child_aabb);
} else {
/* On gfx11, infinities in AABB coords can cause garbage child nodes to be
* returned by box intersection tests with non-default box sorting modes.
* Subtract 1 from the integer representation of inf/-inf to turn it into
* the maximum/minimum representable floating-point value as a workaround.
*/
if (VK_BUILD_FLAG(RADV_BUILD_FLAG_NO_INFS)) {
for (uint32_t i = 0; i < 3; ++i) {
if (isinf(child_aabb.min[i]))
child_aabb.min[i] = uintBitsToFloat(floatBitsToUint(child_aabb.min[i]) - 1);
if (isinf(child_aabb.max[i]))
child_aabb.max[i] = uintBitsToFloat(floatBitsToUint(child_aabb.max[i]) - 1);
}
}
DEREF(dst_node_f32).coords[i] = child_aabb;
}
DEREF(dst_node).coords[i] = child_aabb;
uint32_t child_id = pack_node_id(dst_offset, ir_type_to_bvh_type(type));
children[i] = child_id;
set_parent(child_id, node_id);
children[i] = child_node_id;
set_parent(child_node_id, node_id);
}
for (uint i = found_child_count; i < 4; ++i) {
if (is_box16) {
radv_aabb16 null_aabb;
null_aabb.min_x = NAN_F16;
null_aabb.min_y = NAN_F16;
null_aabb.min_z = NAN_F16;
null_aabb.max_x = NAN_F16;
null_aabb.max_y = NAN_F16;
null_aabb.max_z = NAN_F16;
for (uint i = found_child_count; i < 4; ++i)
DEREF(dst_node_f16).coords[i] = null_aabb;
} else {
for (uint i = found_child_count; i < 4; ++i) {
for (uint comp = 0; comp < 3; ++comp) {
DEREF(dst_node).coords[i].min[comp] = NAN;
DEREF(dst_node).coords[i].max[comp] = NAN;
DEREF(dst_node_f32).coords[i].min[comp] = NAN;
DEREF(dst_node_f32).coords[i].max[comp] = NAN;
}
}
}
/* Make changes to the children's BVH offset value available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
DEREF(dst_node).children = children;
if (is_box16) {
DEREF(dst_node_f16).children = children;
} else {
DEREF(dst_node_f32).children = children;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node).flags = flags;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node_f32).flags = flags;
}
break;
}

View file

@ -56,7 +56,7 @@ bvh_includes = files(
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5',
prog_glslang, '-V', '-I' + bvh_include_dir, '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5',
'-x', '-o', '@OUTPUT@', '@INPUT@', glslang_depfile, glslang_quiet,
]
command += vk_glsl_shader_preamble

View file

@ -374,7 +374,16 @@ rra_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *p
struct radv_device *device = radv_queue_device(queue);
VkResult result = device->layer_dispatch.rra.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence);
if (result != VK_SUCCESS || !device->rra_trace.triggered)
if (result != VK_SUCCESS)
return result;
if (radv_bvh_stats_file()) {
result = radv_dump_bvh_stats(_queue);
if (result != VK_SUCCESS)
return result;
}
if (!device->rra_trace.triggered)
return result;
uint32_t total_trace_count = 0;

View file

@ -75,6 +75,7 @@ enum radv_encode_key_bits {
RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS = (1 << 0),
RADV_ENCODE_KEY_PAIR_COMPRESS_GFX12 = (1 << 1),
RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12 = (1 << 2),
RADV_ENCODE_KEY_USE_BOX16 = (1 << 3),
};
static void
@ -287,6 +288,8 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
VK_FROM_HANDLE(radv_device, device, _device);
struct radv_physical_device *pdev = radv_device_physical(device);
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
uint32_t encode_key = 0;
if (radv_use_bvh8(pdev)) {
/*
@ -302,11 +305,13 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
state->build_info->type != VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR)
encode_key |= RADV_ENCODE_KEY_WRITE_LEAF_NODE_OFFSETS;
VkGeometryTypeKHR geometry_type = vk_get_as_geometry_type(state->build_info);
if (!(state->build_info->flags & (VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR |
VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR)) &&
geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR)
encode_key |= RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12;
} else if (!radv_emulate_rt(pdev)) {
if (!(state->build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR))
encode_key |= RADV_ENCODE_KEY_USE_BOX16;
}
state->config.encode_key[0] = encode_key;
@ -391,6 +396,8 @@ radv_build_flags(VkCommandBuffer commandBuffer, uint32_t key)
flags |= RADV_BUILD_FLAG_PAIR_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_BATCH_COMPRESS_GFX12)
flags |= RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES;
if (key & RADV_ENCODE_KEY_USE_BOX16)
flags |= RADV_BUILD_FLAG_USE_BOX16;
return flags;
}

View file

@ -722,7 +722,7 @@ radv_device_init_tools(struct radv_device *device)
if (result != VK_SUCCESS)
return result;
if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev)) {
if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev)) {
result = radv_rra_trace_init(device);
if (result != VK_SUCCESS)
return result;
@ -798,7 +798,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev))
if (radv_bvh_dumping_enabled(instance) && radv_enable_rt(pdev))
add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
#ifndef _WIN32

View file

@ -115,4 +115,17 @@ const char *radv_get_perftest_option_name(int id);
bool radv_is_rt_wave64_enabled(const struct radv_instance *instance);
static const char *
radv_bvh_stats_file()
{
return os_get_option("RADV_BVH_STATS_FILE");
}
static bool
radv_bvh_dumping_enabled(const struct radv_instance *instance)
{
/* Gathering bvh stats uses a large part of the rra code for dumping bvhs. */
return (instance->vk.trace_mode & RADV_TRACE_MODE_RRA) || radv_bvh_stats_file();
}
#endif /* RADV_INSTANCE_H */

View file

@ -198,7 +198,8 @@ rra_fill_accel_struct_header_common(const struct radv_physical_device *pdev, str
/* TODO: calculate active primitives */
.active_primitive_count = primitive_count,
.geometry_description_count = header->geometry_count,
.interior_fp32_node_count = bvh_info->internal_nodes_size / sizeof(struct radv_bvh_box32_node),
.interior_fp32_node_count = bvh_info->box32_count,
.interior_fp16_node_count = bvh_info->box16_count,
.leaf_node_count = primitive_count,
.rt_driver_interface_version = 8 << 16,
.rt_ip_version = pdev->info.rt_ip_version,
@ -488,6 +489,10 @@ radv_rra_trace_init(struct radv_device *device)
device->rra_trace.ray_history = UTIL_DYNARRAY_INIT;
/* BVH stats dumping does not need ray history. */
if (!(radv_physical_device_instance(pdev)->vk.trace_mode & RADV_TRACE_MODE_RRA))
return VK_SUCCESS;
device->rra_trace.ray_history_buffer_size = debug_get_num_option("RADV_RRA_TRACE_HISTORY_SIZE", 100 * 1024 * 1024);
if (device->rra_trace.ray_history_buffer_size <
sizeof(struct radv_ray_history_header) + sizeof(struct radv_packed_end_trace_token))
@ -624,6 +629,9 @@ radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
simple_mtx_destroy(&data->data_mtx);
_mesa_hash_table_destroy(data->accel_structs, NULL);
_mesa_hash_table_u64_destroy(data->accel_struct_vas);
if (data->stats_file)
fclose(data->stats_file);
}
void
@ -789,7 +797,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
return NULL;
if (data->buffer->memory) {
if (data->buffer && data->buffer->memory) {
VkMemoryMapInfo memory_map_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
.memory = data->buffer->memory,
@ -1297,3 +1305,167 @@ cleanup:
free(accel_struct_offsets);
return result;
}
/* Appends one CSV row describing accel_struct to the stats file configured via
 * RADV_BVH_STATS_FILE. The first call also opens the file and writes the CSV
 * header. BLASes are dumped when tlas_pass == false so that their SAH values
 * are recorded in blas_sah before the TLAS pass accumulates scene SAH.
 */
static void
dump_bvh_stats(struct radv_device *device, struct vk_acceleration_structure *accel_struct,
               struct radv_rra_accel_struct_data *accel_struct_data, uint8_t *data, struct hash_table_u64 *blas_sah,
               bool tlas_pass)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);

   struct radv_accel_struct_header *header = (struct radv_accel_struct_header *)data;

   /* NOTE(review): assumes any BVH containing instances is a TLAS — confirm. */
   bool is_tlas = header->instance_count > 0;
   if (is_tlas != tlas_pass)
      return;

   /* convert root node id to offset */
   uint32_t src_root_offset = (RADV_BVH_ROOT_NODE & ~7) << 3;

   if (rra_validate_header(accel_struct_data, header)) {
      return;
   }

   /* Skip BVHs that fail validation instead of dumping garbage stats. */
   if (radv_use_bvh8(pdev)) {
      if (rra_validate_node_gfx12(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
                                  data + header->bvh_offset + src_root_offset, header->geometry_count,
                                  accel_struct_data->size, !is_tlas, 0)) {
         return;
      }
   } else {
      if (rra_validate_node_gfx10_3(device->rra_trace.accel_struct_vas, data + header->bvh_offset,
                                    data + header->bvh_offset + src_root_offset, header->geometry_count,
                                    accel_struct_data->size, !is_tlas, 0)) {
         return;
      }
   }

   if (!device->rra_trace.stats_file) {
      device->rra_trace.stats_file = fopen(radv_bvh_stats_file(), "w");
      /* Bail out instead of crashing on fprintf(NULL, ...) if the file can not
       * be created.
       */
      if (!device->rra_trace.stats_file) {
         fprintf(stderr, "radv: Failed to open BVH stats file \"%s\"\n", radv_bvh_stats_file());
         return;
      }
      fprintf(device->rra_trace.stats_file, "app,name,type,allocated_size,compacted_size");
      if (radv_use_bvh8(pdev)) {
         fprintf(device->rra_trace.stats_file, ",max_depth,box_node_count,primitive_node_count,instance_node_count");
      } else {
         fprintf(device->rra_trace.stats_file, ",max_depth,box16_node_count,box32_node_count,triangle_node_count,"
                                               "instance_node_count,procedual_node_count");
      }
      fprintf(device->rra_trace.stats_file, ",sah,scene_sah\n");
   }

   fprintf(device->rra_trace.stats_file, "\"%s\",%s,%s,%" PRIu64 ",%" PRIu64, instance->vk.app_info.app_name,
           vk_object_base_name(&accel_struct->base), is_tlas ? "tlas" : "blas", accel_struct_data->size,
           header->compacted_size);

   float extent[3] = {
      header->aabb.max.x - header->aabb.min.x,
      header->aabb.max.y - header->aabb.min.y,
      header->aabb.max.z - header->aabb.min.z,
   };
   /* Surface area of the root AABB; used to normalize the SAH columns.
    * NOTE(review): a degenerate (zero-extent) root AABB makes the divisions
    * below produce inf/NaN — confirm this can not happen for validated BVHs.
    */
   float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);

   float sah;
   float instance_sah;
   if (radv_use_bvh8(pdev)) {
      struct radv_bvh_stats_gfx12 stats = {};
      radv_gather_bvh_stats_gfx12(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
      sah = stats.sah;
      instance_sah = stats.instance_sah;
      fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u", stats.max_depth, stats.box_node_count,
              stats.primitive_node_count, stats.instance_node_count);
   } else {
      struct radv_bvh_stats_gfx10_3 stats = {};
      radv_gather_bvh_stats_gfx10_3(data + header->bvh_offset, RADV_BVH_ROOT_NODE, 1, surface_area, blas_sah, &stats);
      sah = stats.sah;
      instance_sah = stats.instance_sah;
      fprintf(device->rra_trace.stats_file, ",%u,%u,%u,%u,%u,%u", stats.max_depth, stats.box16_node_count,
              stats.box32_node_count, stats.triangle_node_count, stats.instance_node_count, stats.procedual_node_count);
   }

   /* Fixed-point (x1e6) normalized SAH so the CSV stays integer-only. */
   fprintf(device->rra_trace.stats_file, ",%u", (uint32_t)(sah / surface_area * 1000000));
   if (is_tlas) {
      fprintf(device->rra_trace.stats_file, ",%u\n", (uint32_t)((sah + instance_sah) / surface_area * 1000000));
   } else {
      fprintf(device->rra_trace.stats_file, ",0\n");
      /* Remember this BLAS' normalized SAH so TLAS rows can accumulate it. */
      float *sah_ptr = ralloc(blas_sah, float);
      *sah_ptr = sah / surface_area;
      _mesa_hash_table_u64_insert(blas_sah, vk_acceleration_structure_get_va(accel_struct), sah_ptr);
   }
   /* Flush so the data survives an app crash before trace teardown. */
   fflush(device->rra_trace.stats_file);
}
/* Waits for the queue's device to go idle, then dumps BVH stats for every
 * tracked acceleration structure: BLASes first (so their SAH is recorded in
 * blas_sah), then TLASes (which accumulate scene SAH from those records).
 * Returns VK_SUCCESS or the first failing Vulkan result.
 */
VkResult
radv_dump_bvh_stats(VkQueue vk_queue)
{
   VK_FROM_HANDLE(radv_queue, queue, vk_queue);
   struct radv_device *device = radv_queue_device(queue);
   VkDevice vk_device = radv_device_to_handle(device);

   /* Make sure all pending builds/submits finished before reading BVH data. */
   VkResult result = vk_common_DeviceWaitIdle(vk_device);
   if (result != VK_SUCCESS)
      return result;

   struct hash_entry **hash_entries = NULL;
   struct hash_table_u64 *blas_sah = NULL;

   uint32_t struct_count = _mesa_hash_table_num_entries(device->rra_trace.accel_structs);
   /* Nothing to dump. This also avoids misreporting malloc(0) == NULL as an
    * out-of-memory condition below.
    */
   if (!struct_count)
      return VK_SUCCESS;

   hash_entries = malloc(sizeof(*hash_entries) * struct_count);
   if (!hash_entries) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto cleanup;
   }

   struct hash_entry *last_entry = NULL;
   for (unsigned i = 0; (last_entry = _mesa_hash_table_next_entry(device->rra_trace.accel_structs, last_entry)); ++i)
      hash_entries[i] = last_entry;

   /* Sort for a stable dump order across runs. */
   qsort(hash_entries, struct_count, sizeof(*hash_entries), accel_struct_entry_cmp);

   struct rra_copy_context copy_ctx = {
      .device = vk_device,
      .queue = vk_queue,
      .entries = hash_entries,
      .family_index = queue->vk.queue_family_index,
      .min_size = device->rra_trace.ray_history_buffer_size,
   };

   result = rra_copy_context_init(&copy_ctx);
   if (result != VK_SUCCESS)
      goto cleanup;

   blas_sah = _mesa_hash_table_u64_create(NULL);

   /* First pass: BLASes only; records each BLAS' SAH into blas_sah. */
   for (unsigned i = 0; i < struct_count; i++) {
      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
      if (!mapped_data)
         continue;

      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, false);

      rra_unmap_accel_struct_data(&copy_ctx, i);
   }

   /* Second pass: everything not recorded as a BLAS above (i.e. TLASes). */
   for (unsigned i = 0; i < struct_count; i++) {
      if (_mesa_hash_table_u64_search(blas_sah, vk_acceleration_structure_get_va(hash_entries[i]->key)))
         continue;

      void *mapped_data = rra_map_accel_struct_data(&copy_ctx, i);
      if (!mapped_data)
         continue;

      dump_bvh_stats(device, (void *)hash_entries[i]->key, hash_entries[i]->data, mapped_data, blas_sah, true);

      rra_unmap_accel_struct_data(&copy_ctx, i);
   }

   rra_copy_context_finish(&copy_ctx);
   result = VK_SUCCESS;

cleanup:
   /* blas_sah is NULL when we bail before creating it. */
   if (blas_sah)
      _mesa_hash_table_u64_destroy(blas_sah);
   free(hash_entries);
   return result;
}

View file

@ -107,6 +107,7 @@ struct radv_rra_trace_data {
struct hash_table *accel_structs;
struct hash_table_u64 *accel_struct_vas;
simple_mtx_t data_mtx;
FILE *stats_file;
bool validate_as;
bool copy_after_build;
bool triggered;
@ -288,6 +289,8 @@ struct rra_bvh_info {
uint32_t leaf_nodes_size;
uint32_t internal_nodes_size;
uint32_t instance_sideband_data_size;
uint32_t box32_count;
uint32_t box16_count;
struct rra_geometry_info *geometry_infos;
};
@ -320,4 +323,32 @@ void rra_gather_bvh_info_gfx12(const uint8_t *bvh, uint32_t node_id, struct rra_
void rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id, uint32_t src_id,
uint32_t dst_offset);
struct radv_bvh_stats_gfx10_3 {
uint32_t max_depth;
float sah;
float instance_sah;
uint32_t box16_node_count;
uint32_t box32_node_count;
uint32_t triangle_node_count;
uint32_t instance_node_count;
uint32_t procedual_node_count;
};
struct radv_bvh_stats_gfx12 {
uint32_t max_depth;
float sah;
float instance_sah;
uint32_t box_node_count;
uint32_t primitive_node_count;
uint32_t instance_node_count;
};
void radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats);
void radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats);
VkResult radv_dump_bvh_stats(VkQueue vk_queue);
#endif /* RADV_RRA_H */

View file

@ -177,9 +177,11 @@ rra_gather_bvh_info_gfx10_3(const uint8_t *bvh, uint32_t node_id, struct rra_bvh
switch (node_type) {
case radv_bvh_node_box16:
dst->internal_nodes_size += sizeof(struct rra_box16_node);
dst->box16_count++;
break;
case radv_bvh_node_box32:
dst->internal_nodes_size += sizeof(struct rra_box32_node);
dst->box32_count++;
break;
case radv_bvh_node_instance:
dst->leaf_nodes_size += sizeof(struct rra_instance_node);
@ -283,15 +285,15 @@ rra_transcode_box16_node(struct rra_transcoding_context *ctx, const struct radv_
vk_aabb bounds = {
.min =
{
_mesa_half_to_float(src->coords[i][0][0]),
_mesa_half_to_float(src->coords[i][0][1]),
_mesa_half_to_float(src->coords[i][0][2]),
_mesa_half_to_float(src->coords[i].min_x),
_mesa_half_to_float(src->coords[i].min_y),
_mesa_half_to_float(src->coords[i].min_z),
},
.max =
{
_mesa_half_to_float(src->coords[i][1][0]),
_mesa_half_to_float(src->coords[i][1][1]),
_mesa_half_to_float(src->coords[i][1][2]),
_mesa_half_to_float(src->coords[i].max_x),
_mesa_half_to_float(src->coords[i].max_y),
_mesa_half_to_float(src->coords[i].max_z),
},
};
@ -355,3 +357,78 @@ rra_transcode_node_gfx10_3(struct rra_transcoding_context *ctx, uint32_t parent_
return dst_id;
}
void
radv_gather_bvh_stats_gfx10_3(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float p,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx10_3 *stats)
{
uint32_t node_type = node_id & 7;
const void *node = bvh + ((node_id & (~7u)) << 3);
stats->max_depth = MAX2(stats->max_depth, depth);
switch (node_type) {
case radv_bvh_node_box16: {
stats->sah += 1.0 * p;
stats->box16_node_count++;
const struct radv_bvh_box16_node *box16 = node;
for (uint32_t i = 0; i < 4; i++) {
if (box16->children[i] != 0xffffffff) {
float extent[3] = {
_mesa_half_to_float(box16->coords[i].max_x) - _mesa_half_to_float(box16->coords[i].min_x),
_mesa_half_to_float(box16->coords[i].max_y) - _mesa_half_to_float(box16->coords[i].min_y),
_mesa_half_to_float(box16->coords[i].max_z) - _mesa_half_to_float(box16->coords[i].min_z),
};
float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
radv_gather_bvh_stats_gfx10_3(bvh, box16->children[i], depth + 1, surface_area, blas_sah, stats);
}
}
break;
}
case radv_bvh_node_box32: {
stats->sah += 1.5 * p;
stats->box32_node_count++;
const struct radv_bvh_box32_node *box32 = node;
for (uint32_t i = 0; i < 4; i++) {
if (box32->children[i] != 0xffffffff) {
float extent[3] = {
box32->coords[i].max.x - box32->coords[i].min.x,
box32->coords[i].max.y - box32->coords[i].min.y,
box32->coords[i].max.z - box32->coords[i].min.z,
};
float surface_area = 2 * (extent[0] * extent[1] + extent[0] * extent[2] + extent[1] * extent[2]);
radv_gather_bvh_stats_gfx10_3(bvh, box32->children[i], depth + 1, surface_area, blas_sah, stats);
}
}
break;
}
case radv_bvh_node_instance: {
stats->sah += 2.0 * p;
stats->instance_node_count++;
const struct radv_bvh_instance_node *instance = node;
uint64_t blas_va = radv_node_to_addr(instance->bvh_ptr) - instance->bvh_offset;
float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
if (sah)
stats->instance_sah += *sah * p;
else
fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%lx\n", blas_va);
break;
}
case radv_bvh_node_triangle:
stats->sah += 2.0 * p;
stats->triangle_node_count++;
break;
case radv_bvh_node_aabb:
stats->sah += 4.0 * p;
stats->procedual_node_count++;
break;
default:
break;
}
}

View file

@ -10,6 +10,7 @@
#include "radv_rra.h"
#include "util/bitset.h"
#include "util/compiler.h"
struct rra_instance_sideband_data {
uint32_t instance_index;
@ -306,3 +307,98 @@ rra_transcode_node_gfx12(struct rra_transcoding_context *ctx, uint32_t parent_id
}
}
}
void
radv_gather_bvh_stats_gfx12(const uint8_t *bvh, uint32_t node_id, uint32_t depth, float surface_area,
struct hash_table_u64 *blas_sah, struct radv_bvh_stats_gfx12 *stats)
{
uint32_t node_type = node_id & 0xf;
const void *node = bvh + ((node_id & (~0xf)) << 3);
stats->max_depth = MAX2(stats->max_depth, depth);
switch (node_type) {
case radv_bvh_node_box32: {
stats->box_node_count++;
stats->sah += 0.5 * surface_area;
const struct radv_gfx12_box_node *src = node;
uint32_t valid_child_count_minus_one = src->child_count_exponents >> 28;
if (valid_child_count_minus_one != 0xf) {
uint32_t internal_id = src->internal_base_id;
uint32_t primitive_id = src->primitive_base_id;
uint32_t exponents[3] = {
src->child_count_exponents & 0xff,
(src->child_count_exponents >> 8) & 0xff,
(src->child_count_exponents >> 16) & 0xff,
};
float extent[3] = {
uif(exponents[0] << 23),
uif(exponents[1] << 23),
uif(exponents[2] << 23),
};
for (uint32_t i = 0; i <= valid_child_count_minus_one; i++) {
uint32_t child_type = (src->children[i].dword2 >> 24) & 0xf;
uint32_t child_size = src->children[i].dword2 >> 28;
uint32_t child_id;
if (child_type == radv_bvh_node_box32) {
child_id = internal_id | child_type;
internal_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
} else {
child_id = primitive_id | child_type;
primitive_id += (child_size * RADV_GFX12_BVH_NODE_SIZE) >> 3;
}
float min[3] = {
(float)(src->children[i].dword0 & 0xfff) / 0x1000 * extent[0],
(float)((src->children[i].dword0 >> 12) & 0xfff) / 0x1000 * extent[1],
(float)(src->children[i].dword1 & 0xfff) / 0x1000 * extent[2],
};
float max[3] = {
(float)(((src->children[i].dword1 >> 12) & 0xfff) + 1) / 0x1000 * extent[0],
(float)((src->children[i].dword2 & 0xfff) + 1) / 0x1000 * extent[1],
(float)(((src->children[i].dword2 >> 12) & 0xfff) + 1) / 0x1000 * extent[2],
};
float child_extent[3] = {
max[0] - min[0],
max[1] - min[1],
max[2] - min[2],
};
float child_surface_area = 2 * (child_extent[0] * child_extent[1] + child_extent[0] * child_extent[2] +
child_extent[1] * child_extent[2]);
radv_gather_bvh_stats_gfx12(bvh, child_id, depth + 1, child_surface_area, blas_sah, stats);
}
}
break;
}
case radv_bvh_node_instance: {
stats->instance_node_count++;
stats->sah += 0.7 * surface_area;
struct radv_gfx12_instance_node *instance = (struct radv_gfx12_instance_node *)(node);
const struct radv_gfx12_instance_node_user_data *user_data =
(const void *)((const uint8_t *)node + sizeof(struct radv_gfx12_instance_node));
uint64_t blas_va = radv_node_to_addr(instance->pointer_flags_bvh_addr) - user_data->bvh_offset;
float *sah = _mesa_hash_table_u64_search(blas_sah, blas_va);
if (sah)
stats->instance_sah += *sah * surface_area;
else
fprintf(stderr, "radv: Could not find SAH for BLAS at address 0x%lx\n", blas_va);
break;
}
case radv_bvh_node_triangle:
stats->primitive_node_count++;
FALLTHROUGH;
default:
stats->sah += 1.0 * surface_area;
break;
}
}

View file

@ -257,7 +257,7 @@ for src_t in [tint, tuint, tfloat, tbool]:
for dst_t in dst_types:
for dst_bit_size in type_sizes(dst_t):
if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
rnd_modes = ['_rtne', '_rtz', '']
rnd_modes = ['_rtne', '_rtz', '_ru', '_rd', '']
for rnd_mode in rnd_modes:
if rnd_mode == '_rtne':
conv_expr = """
@ -279,6 +279,22 @@ for src_t in [tint, tuint, tfloat, tbool]:
dst = src0;
}
"""
elif rnd_mode == '_ru':
conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_ru(src0));
} else {
dst = src0;
}
"""
elif rnd_mode == '_rd':
conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rd(src0));
} else {
dst = src0;
}
"""
else:
conv_expr = """
if (bit_size > 32) {

View file

@ -0,0 +1,13 @@
/*
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#ifndef SPIRV_INTERNAL_EXTS_H
#define SPIRV_INTERNAL_EXTS_H
#define SpvOpFConvertRUMesa 0
#define SpvOpFConvertRDMesa 1
#endif

View file

@ -923,6 +923,29 @@ vtn_handle_non_semantic_debug_info(struct vtn_builder *b, SpvOp ext_opcode,
return true;
}
/* Handles instructions from the "MesaInternal" extended instruction set
 * (see spirv_internal_exts.h): f32->f16 conversions with explicit
 * round-up/round-down modes. Returns true on success.
 */
static bool
vtn_handle_mesa_internal(struct vtn_builder *b, SpvOp ext_opcode,
                         const uint32_t *w, unsigned count)
{
   /* OpExtInst layout: w[2] = result id, w[4] = instruction number,
    * w[5] = first operand id.
    */
   uint32_t instr = w[4];
   switch (instr) {
   case SpvOpFConvertRUMesa: {
      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
      vtn_push_nir_ssa(b, w[2], nir_f2f16_ru(&b->nb, arg->def));
      break;
   }
   case SpvOpFConvertRDMesa: {
      struct vtn_ssa_value *arg = vtn_ssa_value(b, w[5]);
      vtn_push_nir_ssa(b, w[2], nir_f2f16_rd(&b->nb, arg->def));
      break;
   }
   default:
      /* Fail loudly instead of silently leaving the result id undefined. */
      vtn_fail("Unhandled MesaInternal instruction: %u", instr);
   }

   return true;
}
static void
vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
const uint32_t *w, unsigned count)
@ -958,6 +981,8 @@ vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
val->ext_handler = vtn_handle_debug_printf;
} else if (strstr(ext, "NonSemantic.") == ext) {
val->ext_handler = vtn_handle_non_semantic_instruction;
} else if (strstr(ext, "MesaInternal") == ext) {
val->ext_handler = vtn_handle_mesa_internal;
} else {
vtn_fail("Unsupported extension: %s", ext);
}

View file

@ -33,6 +33,7 @@
#include "spirv.h"
#include "spirv_info.h"
#include "vtn_generator_ids.h"
#include "spirv_internal_exts.h"
extern uint32_t mesa_spirv_debug;

View file

@ -211,3 +211,41 @@ uint16_t _mesa_uint16_div_64k_to_half(uint16_t v)
return (e << 10) | m;
}
/* Returns the next representable fp16 value after x in the direction selected
 * by up (true: towards +infinity, false: towards -infinity). NaN inputs, and
 * stepping past +/-infinity in the requested direction, return x unchanged.
 */
static uint16_t
util_nextafter16(uint16_t x, bool up)
{
   uint16_t sign_mask = 1ull << 15;
   /* Smallest positive subnormal fp16 bit pattern. */
   uint16_t min_abs = 1;
   float f = _mesa_half_to_float(x);
   if (isnan(f) || (f == INFINITY && up) || (f == -INFINITY && !up))
      return x;
   /* beware of: +/-0.0 - 1 == NaN */
   uint16_t xn = f == 0 ? (sign_mask | min_abs) : x - 1;
   /* beware of -0.0 + 1 == -0x1p-149 */
   uint16_t xp = f == 0 ? min_abs : x + 1;
   /* nextafter can be implemented by just +/- 1 on the int value: the fp16 bit
    * pattern is monotonic in magnitude, so +1 moves away from zero and -1
    * moves towards zero; XOR with the sign flips that into the up/down sense.
    */
   return (up ^ (f < 0)) ? xp : xn;
}
/* Converts val to fp16, rounding towards +infinity. */
uint16_t
_mesa_float_to_float16_ru(float val)
{
   const uint16_t nearest = _mesa_float_to_half(val);
   /* If the default conversion landed below val, step one ULP up. */
   const bool rounded_down = _mesa_half_to_float(nearest) < val;
   return rounded_down ? util_nextafter16(nearest, true) : nearest;
}
/* Converts val to fp16, rounding towards -infinity. */
uint16_t
_mesa_float_to_float16_rd(float val)
{
   const uint16_t nearest = _mesa_float_to_half(val);
   /* If the default conversion landed above val, step one ULP down. */
   const bool rounded_up = _mesa_half_to_float(nearest) > val;
   return rounded_up ? util_nextafter16(nearest, false) : nearest;
}

View file

@ -113,6 +113,9 @@ _mesa_float_to_float16_rtz(float val)
return _mesa_float_to_float16_rtz_slow(val);
}
uint16_t _mesa_float_to_float16_ru(float val);
uint16_t _mesa_float_to_float16_rd(float val);
static inline uint16_t
_mesa_float_to_float16_rtne(float val)
{

View file

@ -42,6 +42,7 @@ bvh_shaders = [
],
]
spirv_include_dir = dir_source_root + '/src/compiler/spirv'
vk_bvh_include_dir = dir_source_root + '/src/vulkan/runtime/bvh'
vk_bvh_includes = files(
@ -50,6 +51,7 @@ vk_bvh_includes = files(
'vk_build_interface.h',
'vk_bvh.h',
'vk_debug.h',
spirv_include_dir + '/spirv_internal_exts.h',
)
vk_glsl_shader_extensions = [
@ -69,6 +71,7 @@ vk_glsl_shader_extensions = [
'GL_KHR_shader_subgroup_ballot',
'GL_KHR_shader_subgroup_clustered',
'GL_EXT_shader_atomic_int64',
'GL_EXT_spirv_intrinsics',
]
vk_glsl_shader_preamble = []
@ -79,7 +82,7 @@ endforeach
bvh_spv = []
foreach s : bvh_shaders
command = [
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + spirv_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
] + (with_mesa_debug ? ['-g'] : [])
command += glslang_quiet
command += vk_glsl_shader_preamble

View file

@ -180,6 +180,7 @@
#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)
#define NAN_F16 (0.0hf / 0.0hf)
#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))