mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-20 15:38:19 +02:00
Reviewed-by: Natalie Vock <natalie.vock@gmx.de> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38462>
688 lines
33 KiB
Text
688 lines
33 KiB
Text
/*
|
|
* Copyright © 2025 Valve Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#version 460
|
|
|
|
#extension GL_GOOGLE_include_directive : require
|
|
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
|
#extension GL_EXT_scalar_block_layout : require
|
|
#extension GL_EXT_buffer_reference : require
|
|
#extension GL_EXT_buffer_reference2 : require
|
|
#extension GL_KHR_memory_scope_semantics : require
|
|
#extension GL_KHR_shader_subgroup_basic : require
|
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
|
#extension GL_KHR_shader_subgroup_ballot : require
|
|
#extension GL_KHR_shader_subgroup_clustered : require
|
|
|
|
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
#define GFX12
|
|
#define USE_GLOBAL_SYNC
|
|
|
|
#include "vk_debug.h"
|
|
|
|
#include "build_helpers.h"
|
|
#include "build_interface.h"
|
|
#include "encode.h"
|
|
#include "invocation_cluster.h"
|
|
|
|
/* Push constants of this shader. The code below reads header, batches_size, intermediate_bvh, output_base and
 * output_bvh_offset from args.
 */
layout(push_constant) uniform CONSTS
|
|
{
|
|
encode_triangles_gfx12_args args;
|
|
};
|
|
|
|
/* Sentinel value of vertex_indices for an invocation whose triangles are not part of the node being built. */
#define UNASSIGNED_VERTEX_INDICES 0xfffffffffffful
|
|
|
|
/* Entry point. Each invocation cluster processes one radv_triangle_encode_task:
 *
 * 1. Every active invocation loads one triangle pair (one or two vk_ir_triangle_node) and deduplicates the
 *    vertices of the second triangle against the first one.
 * 2. A loop greedily groups the pairs into primitive nodes whose encoded size does not exceed
 *    RADV_GFX12_BVH_NODE_SIZE * 8 bits.
 * 3. Unless this is the retry pass, a task that produced a node with a single triangle is queued for the retry
 *    pass and nothing is written.
 * 4. Space for the nodes is allocated, the child entries of the parent box node are patched and the first
 *    invocation of every node writes the node with a bit_writer.
 */
void
|
|
main()
|
|
{
|
|
bool is_retry = VK_BUILD_FLAG(RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY);
|
|
|
|
uint32_t global_id = gl_GlobalInvocationID.x;
|
|
|
|
/* Each invocation cluster handles one task. */
|
|
radv_invocation_cluster cluster;
|
|
radv_invocation_cluster_init(cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
uint32_t task_index = global_id / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
|
|
/* In the retry pass the task index is read from the list of retry indices at offset
 * SIZEOF(vk_ir_header) + args.batches_size from the header. The non-retry path below appends to that list.
 */
if (is_retry) {
|
|
VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size);
|
|
task_index = DEREF(INDEX(uint32_t, retry_indices, task_index));
|
|
}
|
|
|
|
VOID_REF triangle_tasks = OFFSET(args.header, SIZEOF(vk_ir_header));
|
|
REF(radv_triangle_encode_task) task = INDEX(radv_triangle_encode_task, triangle_tasks, task_index);
|
|
uint32_t pair_index_node_index0 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2];
|
|
uint32_t pair_index_node_index1 = DEREF(task).pair_index_node_index[cluster.invocation_index * 2 + 1];
|
|
|
|
/* The number of pairs is the index of the first invocation whose first entry is RADV_BVH_INVALID_NODE, capped
 * at 8.
 */
uint32_t total_pair_count = min(findLSB(radv_ballot(cluster, pair_index_node_index0 == RADV_BVH_INVALID_NODE)), 8u);
|
|
|
|
if (cluster.invocation_index >= total_pair_count)
|
|
return;
|
|
|
|
uint32_t leaf_index0 = pair_index_node_index0 & 0x0fffffff;
|
|
vk_ir_triangle_node node0 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index0));
|
|
|
|
uint32_t triangle_id0 = node0.triangle_id;
|
|
uint32_t geometry_id0 = node0.geometry_id_and_flags & 0xffffff;
|
|
bool opaque0 = (node0.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
|
|
uint32_t triangle_id1 = triangle_id0;
|
|
uint32_t geometry_id1 = geometry_id0;
|
|
bool opaque1 = false;
|
|
|
|
vec3 vertices[6];
|
|
vertices[0] = vec3(node0.coords[0][0], node0.coords[0][1], node0.coords[0][2]);
|
|
vertices[1] = vec3(node0.coords[1][0], node0.coords[1][1], node0.coords[1][2]);
|
|
vertices[2] = vec3(node0.coords[2][0], node0.coords[2][1], node0.coords[2][2]);
|
|
|
|
/* One nibble per vertex slot. Nibble i holds the slot whose position vertex i uses. It is the identity unless a
 * vertex of the second triangle equals a vertex of the first one.
 */
uint32_t pair_vertex_indices = 0x210;
|
|
|
|
uint32_t pair_size = 1;
|
|
if (pair_index_node_index1 != RADV_BVH_INVALID_NODE) {
|
|
pair_size = 2;
|
|
|
|
uint32_t leaf_index1 = pair_index_node_index1 & 0x0fffffff;
|
|
vk_ir_triangle_node node1 = DEREF(INDEX(vk_ir_triangle_node, args.intermediate_bvh, leaf_index1));
|
|
|
|
triangle_id1 = node1.triangle_id;
|
|
geometry_id1 = node1.geometry_id_and_flags & 0xffffff;
|
|
opaque1 = (node1.geometry_id_and_flags & VK_GEOMETRY_OPAQUE) != 0;
|
|
|
|
vertices[3] = vec3(node1.coords[0][0], node1.coords[0][1], node1.coords[0][2]);
|
|
vertices[4] = vec3(node1.coords[1][0], node1.coords[1][1], node1.coords[1][2]);
|
|
vertices[5] = vec3(node1.coords[2][0], node1.coords[2][1], node1.coords[2][2]);
|
|
|
|
pair_vertex_indices = 0x543210;
|
|
|
|
/* Deduplicate vertices here so it does not have to be done during the compression loop. */
|
|
for (uint32_t i = 0; i < 3; i++) {
|
|
for (uint32_t j = 0; j < 3; j++) {
|
|
if (vertices[3 + i] == vertices[j]) {
|
|
uint32_t bit_offset = (i + 3) * 4;
|
|
uint32_t clear_mask = ~(0xf << bit_offset);
|
|
pair_vertex_indices = (pair_vertex_indices & clear_mask) | (j << bit_offset);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Encode inside a loop. Every active invocation tries to compress with the previously chosen
|
|
* nodes. The invocation with the smallest node size is chosen. TODO: Are there better heuristics?
|
|
* If there are no new candidates because the node would be too large, encode the previously chosen nodes
|
|
* and break out of the loop. In this case the first active invocation is chosen.
|
|
*/
|
|
|
|
/* Each vertex is described by 8 bits. The highest 4 contain the invocation index and the low 4 bits contain the
|
|
* array index.
|
|
*/
|
|
uint64_t vertex_indices = UNASSIGNED_VERTEX_INDICES;
|
|
|
|
bool vertex_used[6] = {false, false, false, false, false, false};
|
|
|
|
uint32_t hw_node_index = 0;
|
|
uvec3 encode_vertex_payload_bit_size;
|
|
uint32_t encode_trailing_zero_bits;
|
|
uint32_t encode_geometry_id_base_bit_size;
|
|
uint32_t encode_geometry_id_payload_bit_size;
|
|
uint32_t encode_triangle_id_base_bit_size;
|
|
uint32_t encode_triangle_id_payload_bit_size;
|
|
uint32_t encode_indices_midpoint;
|
|
|
|
uint32_t invocation_vertex_count = pair_index_node_index1 != RADV_BVH_INVALID_NODE ? 6 : 3;
|
|
|
|
while (true) {
|
|
/* assigned is true for every invocation whose triangles are already part of the node. */
|
|
bool assigned = vertex_indices != UNASSIGNED_VERTEX_INDICES;
|
|
uint32_t assigned_mask = radv_ballot(cluster, assigned);
|
|
uint32_t first_assigned_invocation = findLSB(assigned_mask);
|
|
uint32_t last_assigned_invocation = assigned_mask != 0 ? findMSB(assigned_mask) : 0;
|
|
|
|
if (!assigned)
|
|
vertex_indices = 0;
|
|
|
|
bool found[6] = {false, false, false, false, false, false};
|
|
|
|
/* At this point vertex_used is only set for assigned invocations since the rejected candidate invocations are
|
|
* reset.
|
|
*/
|
|
uint32_t vertex_count = 0;
|
|
for (uint32_t i = 0; i < 6; i++)
|
|
vertex_count += bitCount(radv_ballot(cluster, vertex_used[i]));
|
|
|
|
for (uint32_t target_invocation = first_assigned_invocation; target_invocation <= last_assigned_invocation;
|
|
target_invocation++) {
|
|
|
|
if (((assigned_mask >> target_invocation) & 1) == 0)
|
|
continue;
|
|
|
|
vec3 target_vertices[6];
|
|
bool target_vertex_used[6];
|
|
for (uint32_t i = 0; i < 6; i++) {
|
|
target_vertices[i] = radv_read_invocation(cluster, target_invocation, vertices[i]);
|
|
target_vertex_used[i] = radv_read_invocation(cluster, target_invocation, vertex_used[i]);
|
|
}
|
|
|
|
uint32_t target_vertex_count = radv_read_invocation(cluster, target_invocation, invocation_vertex_count);
|
|
|
|
if (!assigned) {
|
|
for (uint32_t candidate_vertex_index = 0; candidate_vertex_index < invocation_vertex_count;
|
|
candidate_vertex_index++) {
|
|
if (found[candidate_vertex_index])
|
|
continue;
|
|
|
|
uint32_t assign_index = 0;
|
|
|
|
for (uint32_t target_vertex_index = 0; target_vertex_index < target_vertex_count;
|
|
target_vertex_index++) {
|
|
if (target_vertex_used[target_vertex_index] &&
|
|
target_vertices[target_vertex_index] == vertices[candidate_vertex_index]) {
|
|
found[candidate_vertex_index] = true;
|
|
assign_index = target_vertex_index;
|
|
}
|
|
}
|
|
|
|
if (found[candidate_vertex_index])
|
|
vertex_indices |= uint64_t((target_invocation << 4) + assign_index)
|
|
<< uint64_t(candidate_vertex_index * 8);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Handle the remaining vertices that are not already present in the assigned invocations. */
|
|
if (!assigned) {
|
|
for (uint32_t i = 0; i < invocation_vertex_count; i++) {
|
|
if (found[i])
|
|
continue;
|
|
|
|
uint32_t pair_vertex_index = (pair_vertex_indices >> (i * 4)) & 0xf;
|
|
if (pair_vertex_index == i) {
|
|
vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8);
|
|
vertex_used[i] = true;
|
|
vertex_count++;
|
|
} else {
|
|
uint64_t vertex_index = (vertex_indices >> uint64_t(pair_vertex_index * 8)) & 0xff;
|
|
vertex_indices |= vertex_index << uint64_t(i * 8);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Compute the node layout and size. For assigned invocations, the values contain information about the node with
|
|
* only the assigned triangles and for !assigned invocations, the current invocation is included.
|
|
*/
|
|
|
|
uint32_t triangle_id_base_bit_size;
|
|
uint32_t triangle_id_payload_bit_size;
|
|
uint32_t geometry_id_base_bit_size;
|
|
uint32_t geometry_id_payload_bit_size;
|
|
uint32_t trailing_zero_bits;
|
|
uvec3 vertex_payload_bit_size;
|
|
bool has_assigned = first_assigned_invocation != 0xffffffff;
|
|
uint32_t size_loop_start = has_assigned ? 0 : cluster.invocation_index;
|
|
uint32_t size_loop_end = has_assigned ? first_assigned_invocation : cluster.invocation_index;
|
|
for (uint32_t i = size_loop_start; i <= size_loop_end; i++) {
|
|
uvec3 vertex_prefix = radv_read_invocation(cluster, i, floatBitsToUint(vertices[0]));
|
|
uvec3 vertex_payload_mask = uvec3(0);
|
|
uint32_t vertex_non_zero_mask = 0;
|
|
for (uint32_t i = 0; i < invocation_vertex_count; i++) {
|
|
vertex_payload_mask |= vertex_prefix ^ floatBitsToUint(vertices[i]);
|
|
vertex_non_zero_mask |=
|
|
floatBitsToUint(vertices[i].x) | floatBitsToUint(vertices[i].y) | floatBitsToUint(vertices[i].z);
|
|
}
|
|
uint32_t invoc_trailing_zero_bits = min(findLSB(vertex_non_zero_mask), 32u);
|
|
uvec3 invoc_vertex_payload_bit_size = min(findMSB(vertex_payload_mask), 31u) + 1;
|
|
trailing_zero_bits =
|
|
subgroupClusteredMin(assigned ? invoc_trailing_zero_bits : 32, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
vertex_payload_bit_size =
|
|
subgroupClusteredMax(assigned ? invoc_vertex_payload_bit_size : uvec3(0), RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
/* Determine the number of bits required to represent the node ids in the hw's encoding format.
|
|
* Base and "offset" are masked and OR'd together, so look at the highest-ordered differing bit.
|
|
*/
|
|
uint32_t triangle_id_base = radv_read_invocation(cluster, i, triangle_id0);
|
|
triangle_id_base_bit_size = findMSB(triangle_id_base) + 1;
|
|
uint32_t invoc_triangle_id_payload_bit_size =
|
|
max(findMSB(triangle_id0 ^ triangle_id_base), findMSB(triangle_id1 ^ triangle_id_base)) + 1;
|
|
triangle_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_triangle_id_payload_bit_size : 0,
|
|
RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
uint32_t geometry_id_base = radv_read_invocation(cluster, i, geometry_id0);
|
|
geometry_id_base_bit_size = align(findMSB(geometry_id_base) + 1, 2);
|
|
uint32_t invoc_geometry_id_payload_bit_size =
|
|
max(findMSB(geometry_id0 ^ geometry_id_base), findMSB(geometry_id1 ^ geometry_id_base)) + 1;
|
|
geometry_id_payload_bit_size = subgroupClusteredMax(assigned ? invoc_geometry_id_payload_bit_size : 0,
|
|
RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
if (!assigned) {
|
|
trailing_zero_bits = min(trailing_zero_bits, invoc_trailing_zero_bits);
|
|
vertex_payload_bit_size = max(vertex_payload_bit_size, invoc_vertex_payload_bit_size);
|
|
triangle_id_payload_bit_size = max(triangle_id_payload_bit_size, invoc_triangle_id_payload_bit_size);
|
|
geometry_id_payload_bit_size = max(geometry_id_payload_bit_size, invoc_geometry_id_payload_bit_size);
|
|
}
|
|
|
|
if (cluster.invocation_index <= i)
|
|
break;
|
|
}
|
|
|
|
geometry_id_payload_bit_size = align(geometry_id_payload_bit_size, 2);
|
|
|
|
vertex_payload_bit_size.x =
|
|
vertex_payload_bit_size.x > trailing_zero_bits ? vertex_payload_bit_size.x - trailing_zero_bits : 1;
|
|
vertex_payload_bit_size.y =
|
|
vertex_payload_bit_size.y > trailing_zero_bits ? vertex_payload_bit_size.y - trailing_zero_bits : 1;
|
|
vertex_payload_bit_size.z =
|
|
vertex_payload_bit_size.z > trailing_zero_bits ? vertex_payload_bit_size.z - trailing_zero_bits : 1;
|
|
|
|
uvec3 vertex_base_bit_size = uvec3(32 - trailing_zero_bits) - vertex_payload_bit_size;
|
|
|
|
uint32_t required_bit_size = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE;
|
|
|
|
required_bit_size += vertex_base_bit_size.x + vertex_base_bit_size.y + vertex_base_bit_size.z;
|
|
required_bit_size +=
|
|
vertex_count * (vertex_payload_bit_size.x + vertex_payload_bit_size.y + vertex_payload_bit_size.z);
|
|
|
|
uint32_t pair_count = bitCount(assigned_mask);
|
|
if (!assigned)
|
|
pair_count++;
|
|
|
|
required_bit_size += geometry_id_base_bit_size + (pair_count * 2 - 1) * geometry_id_payload_bit_size;
|
|
uint32_t indices_midpoint = required_bit_size;
|
|
required_bit_size += triangle_id_base_bit_size + (pair_count * 2 - 1) * triangle_id_payload_bit_size;
|
|
|
|
uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count;
|
|
required_bit_size += triangle_pair_descs_size;
|
|
|
|
/* Reject layouts with more than 15 vertices by forcing the size above the node size. */
if (vertex_count > 15)
|
|
required_bit_size = RADV_GFX12_BVH_NODE_SIZE * 8 + 1;
|
|
|
|
/* This is only relevant for unassigned invocations. If every invocation is assigned, the 0xffffffff will force a
|
|
* final flush.
|
|
*/
|
|
uint32_t min_required_bit_size =
|
|
subgroupClusteredMin(assigned ? 0xffffffff : required_bit_size, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
/* The last iteration always needs to write the remaining triangles. */
|
|
if (min_required_bit_size > RADV_GFX12_BVH_NODE_SIZE * 8) {
|
|
if (assigned) {
|
|
encode_vertex_payload_bit_size = vertex_payload_bit_size;
|
|
encode_trailing_zero_bits = trailing_zero_bits;
|
|
encode_geometry_id_base_bit_size = geometry_id_base_bit_size;
|
|
encode_geometry_id_payload_bit_size = geometry_id_payload_bit_size;
|
|
encode_triangle_id_base_bit_size = triangle_id_base_bit_size;
|
|
encode_triangle_id_payload_bit_size = triangle_id_payload_bit_size;
|
|
encode_indices_midpoint = indices_midpoint;
|
|
break;
|
|
} else {
|
|
hw_node_index++;
|
|
|
|
vertex_indices = UNASSIGNED_VERTEX_INDICES;
|
|
for (uint32_t i = 0; i < 6; i++)
|
|
vertex_used[i] = false;
|
|
}
|
|
} else {
|
|
/* Among the unassigned invocations with the smallest node size, the one with the highest invocation index
 * (findMSB) stays in the node. All other candidates are reset and try again in the next iteration.
 */
uint32_t chosen_invocation =
|
|
findMSB(radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size));
|
|
if (cluster.invocation_index != chosen_invocation && !assigned) {
|
|
vertex_indices = UNASSIGNED_VERTEX_INDICES;
|
|
for (uint32_t i = 0; i < 6; i++)
|
|
vertex_used[i] = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
uint32_t hw_node_count = subgroupClusteredMax(hw_node_index, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) + 1;
|
|
|
|
uint32_t pair_index;
|
|
uint32_t pair_base_index = 0;
|
|
uint32_t pair_count;
|
|
uint32_t first_active_in_node;
|
|
uint32_t node_mask;
|
|
uint32_t node_invocations;
|
|
/* Gather per-node information: the mask of invocations in this invocation's node, the position of this
 * invocation inside the node (pair_index) and the number of pairs in the preceding nodes (pair_base_index).
 */
for (uint32_t i = 0; i < hw_node_count; i++) {
|
|
uint32_t current_node_mask = radv_ballot(cluster, hw_node_index == i);
|
|
if (hw_node_index == i) {
|
|
node_mask = current_node_mask;
|
|
pair_count = bitCount(node_mask);
|
|
first_active_in_node = findLSB(node_mask);
|
|
pair_index = bitCount(node_mask & ((1u << cluster.invocation_index) - 1));
|
|
/* Pack the invocation index of every pair of this node into 4 bits at position pair_index. */
node_invocations = subgroupClusteredOr(cluster.invocation_index << (pair_index * 4),
|
|
RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
break;
|
|
}
|
|
pair_base_index += bitCount(current_node_mask);
|
|
}
|
|
|
|
bool is_single_prim_node = pair_count == 1 && pair_index_node_index1 == RADV_BVH_INVALID_NODE;
|
|
|
|
/* If there is a node that contains only one primitive, abort this encoding attempt and retry during a second pass
|
|
* which will pair such nodes. This needs a separate pass so that the allocated nodes of two batches can be
|
|
* guaranteed to be close enough since primitive nodes can only have small relative offsets. The retry pass has
|
|
* RADV_BUILD_FLAG_BATCH_COMPRESS_TRIANGLES_RETRY (is_retry) set.
|
|
*/
|
|
uint32_t single_prim_node_invoc = findLSB(radv_ballot(cluster, is_single_prim_node));
|
|
bool has_single_prim_node = radv_ballot(cluster, is_single_prim_node) != 0;
|
|
if (!is_retry && has_single_prim_node) {
|
|
if (cluster.invocation_index == 0) {
|
|
uint32_t retry_base_invocation =
|
|
atomicAdd(DEREF(args.header).driver_internal[RADV_IR_HEADER_ENCODE_TRIANGLES_RETRY_INVOCATIONS_X],
|
|
RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT);
|
|
|
|
uint32_t retry_batch_index_index = retry_base_invocation / RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
|
|
VOID_REF retry_indices = OFFSET(args.header, SIZEOF(vk_ir_header) + args.batches_size);
|
|
DEREF(INDEX(uint32_t, retry_indices, retry_batch_index_index)) = task_index;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
if (is_retry) {
|
|
/* Move the single primitive node to the end since it needs to offset into the next batch. */
|
|
uint32_t single_prim_pair_base_index = radv_read_invocation(cluster, single_prim_node_invoc, pair_base_index);
|
|
|
|
if (pair_base_index > single_prim_pair_base_index)
|
|
pair_base_index--;
|
|
if (is_single_prim_node)
|
|
pair_base_index = total_pair_count - 1;
|
|
}
|
|
|
|
REF(radv_gfx12_box_node) parent_node =
|
|
REF(radv_gfx12_box_node)(args.output_base + args.output_bvh_offset + DEREF(task).parent_offset);
|
|
/* The top 4 bits of child_count_exponents plus one give the first child slot used for the leaves; 0x10 wraps
 * around to 0.
 */
uint32_t first_leaf_child_index = (DEREF(parent_node).child_count_exponents >> 28) + 1;
|
|
if (first_leaf_child_index == 0x10)
|
|
first_leaf_child_index = 0;
|
|
|
|
/* Two batches are always combined into one during the retry pass if there is a second batch. The goal is to merge
|
|
* all primitive nodes with just one triangle (except one if there is an odd number of such nodes). Since the
|
|
* compression loop above can always merge at least two nodes, the following assumptions should hold:
|
|
*
|
|
* - There is at most one primitive node with only one triangle in a batch
|
|
* - this primitive has the max hw_node_index in this batch.
|
|
*
|
|
* If there is a second batch, the first batch will allocate one less primitive node. This is the triangle that will
|
|
* be merged into the second batch which we know has the highest hw_node_index/dst_offset. The second batch starts in
|
|
* dst memory where the primitive that was removed from the first batch should have been. The merged triangle can be
|
|
* referenced in two different ways:
|
|
*
|
|
* - If the batch contains only one triangle, the primitive_base_id is changed to point at the merged node.
|
|
* - Otherwise the node size inside the child info before the moved triangle child info is set to skip ahead to the
|
|
* merged primitive node in the second batch.
|
|
*/
|
|
|
|
if (is_retry) {
|
|
assert(bitCount(radv_ballot(cluster, is_single_prim_node)) == 1,
|
|
"radv: encode_triangles_gfx12: There must be exactly one node with only one triangle.\n");
|
|
assert(!is_single_prim_node || hw_node_index == hw_node_count - 1,
|
|
"radv: encode_triangles_gfx12: The single triangle primitive node must be last.\n");
|
|
}
|
|
|
|
radv_invocation_cluster alloc_cluster = cluster;
|
|
uint32_t alloc_hw_node_count = hw_node_count;
|
|
bool has_second_batch = false;
|
|
bool jump_to_second_batch = false;
|
|
uint32_t single_prim_node_invocs[2];
|
|
if (is_retry) {
|
|
radv_invocation_cluster_init(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT * 2);
|
|
|
|
has_second_batch = (radv_ballot(alloc_cluster, true) >> RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) != 0;
|
|
|
|
single_prim_node_invocs[0] = radv_read_invocation(alloc_cluster, 0, single_prim_node_invoc);
|
|
single_prim_node_invocs[1] =
|
|
radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, single_prim_node_invoc) +
|
|
RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
|
|
|
|
if (has_second_batch) {
|
|
alloc_hw_node_count =
|
|
radv_read_invocation(alloc_cluster, 0, hw_node_count) +
|
|
radv_read_invocation(alloc_cluster, RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT, hw_node_count) - 1;
|
|
|
|
jump_to_second_batch = alloc_cluster.invocation_index < RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT;
|
|
if (is_single_prim_node) {
|
|
encode_vertex_payload_bit_size = uvec3(32);
|
|
encode_trailing_zero_bits = 0;
|
|
encode_geometry_id_base_bit_size = 24;
|
|
encode_geometry_id_payload_bit_size = 24;
|
|
encode_triangle_id_base_bit_size = 24;
|
|
encode_triangle_id_payload_bit_size = 24;
|
|
encode_indices_midpoint = RADV_GFX12_PRIMITIVE_NODE_HEADER_SIZE + 32 * 9 * 2 + 24 * 2;
|
|
|
|
vertex_indices = 0;
|
|
for (uint32_t i = 0; i < 6; i++) {
|
|
vertex_used[i] = true;
|
|
vertex_indices |= uint64_t((cluster.invocation_index << 4) + i) << uint64_t(i * 8);
|
|
}
|
|
|
|
vertices[3] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[0]);
|
|
vertices[4] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[1]);
|
|
vertices[5] = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], vertices[2]);
|
|
|
|
triangle_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], triangle_id0);
|
|
geometry_id1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], geometry_id0);
|
|
opaque1 = radv_read_invocation(alloc_cluster, single_prim_node_invocs[0], opaque0);
|
|
|
|
/* Indicate that there is a second node. The actual value of pair_index_node_index1 is not used. */
|
|
pair_index_node_index1 = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Allocate space for the primitive node. */
|
|
uint32_t dst_offset;
|
|
if (cluster.invocation_index == 0) {
|
|
if (alloc_cluster.invocation_index == 0)
|
|
dst_offset = atomicAdd(DEREF(args.header).dst_node_offset, alloc_hw_node_count * RADV_GFX12_BVH_NODE_SIZE);
|
|
dst_offset = radv_read_invocation(alloc_cluster, 0, dst_offset);
|
|
|
|
if (alloc_cluster.invocation_index == RADV_TRIANGLE_ENCODE_TASK_INVOCATION_COUNT) {
|
|
dst_offset +=
|
|
radv_read_invocation(alloc_cluster, 0, hw_node_count) * RADV_GFX12_BVH_NODE_SIZE - RADV_GFX12_BVH_NODE_SIZE;
|
|
}
|
|
|
|
DEREF(parent_node).primitive_base_id = pack_node_id(dst_offset, 0);
|
|
DEREF(parent_node).child_count_exponents = (DEREF(parent_node).child_count_exponents & 0x0fffffff) |
|
|
((first_leaf_child_index + total_pair_count - 1) << 28);
|
|
}
|
|
dst_offset = radv_read_invocation(cluster, 0, dst_offset) + hw_node_index * RADV_GFX12_BVH_NODE_SIZE;
|
|
|
|
uint32_t second_dst_offset = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset);
|
|
bool rewrite_primitive_base_id = jump_to_second_batch && total_pair_count == 1;
|
|
if (rewrite_primitive_base_id)
|
|
DEREF(parent_node).primitive_base_id = pack_node_id(second_dst_offset, 0);
|
|
|
|
radv_gfx12_box_child child = DEREF(parent_node).children[first_leaf_child_index + cluster.invocation_index];
|
|
|
|
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
|
|
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
|
|
|
|
if (pair_index < pair_count - 1)
|
|
child.dword2 = child.dword2 & 0xffffff;
|
|
|
|
uint32_t jump_size = radv_read_invocation(alloc_cluster, single_prim_node_invocs[1], dst_offset) - dst_offset;
|
|
if (jump_to_second_batch && !rewrite_primitive_base_id && pair_base_index + pair_index == (total_pair_count - 1) - 1)
|
|
child.dword2 = (child.dword2 & 0xffffff) | ((jump_size / RADV_GFX12_BVH_NODE_SIZE) << 28);
|
|
|
|
/* Update the node type because it encodes the pair index which cannot be known in advance.
|
|
* The BVH8 encoding uses 4 bits for the type. The high bit is used to reference up to 8 pairs.
|
|
*/
|
|
child.dword2 |= ((pair_index & 0x3) << 24);
|
|
if (pair_index >= 4)
|
|
child.dword2 |= (8 << 24);
|
|
|
|
DEREF(parent_node).children[first_leaf_child_index + pair_base_index + pair_index] = child;
|
|
|
|
/* Return because the triangle is written by the second batch. */
|
|
if (is_single_prim_node && jump_to_second_batch)
|
|
return;
|
|
|
|
VOID_REF dst_leaf_addr = args.output_base + args.output_bvh_offset + dst_offset;
|
|
|
|
bit_writer writer;
|
|
bit_writer_init(writer, dst_leaf_addr);
|
|
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
assert(encode_indices_midpoint >= 54, "radv: encode_triangles_gfx12: encode_indices_midpoint < 54.\n");
|
|
assert(encode_indices_midpoint < 1024, "radv: encode_triangles_gfx12: encode_indices_midpoint >= 1024.\n");
|
|
|
|
bit_writer_write(writer, encode_vertex_payload_bit_size.x - 1, 5); /* x_vertex_bits_minus_one */
|
|
bit_writer_write(writer, encode_vertex_payload_bit_size.y - 1, 5); /* y_vertex_bits_minus_one */
|
|
bit_writer_write(writer, encode_vertex_payload_bit_size.z - 1, 5); /* z_vertex_bits_minus_one */
|
|
bit_writer_write(writer, encode_trailing_zero_bits, 5); /* trailing_zero_bits */
|
|
bit_writer_write(writer, encode_geometry_id_base_bit_size / 2, 4); /* geometry_index_base_bits_div_2 */
|
|
bit_writer_write(writer, encode_geometry_id_payload_bit_size / 2, 4); /* geometry_index_bits_div_2 */
|
|
bit_writer_write(writer, pair_count - 1, 3); /* triangle_pair_count_minus_one */
|
|
bit_writer_write(writer, 0, 1); /* vertex_type */
|
|
bit_writer_write(writer, encode_triangle_id_base_bit_size, 5); /* primitive_index_base_bits */
|
|
bit_writer_write(writer, encode_triangle_id_payload_bit_size, 5); /* primitive_index_bits */
|
|
bit_writer_write(writer, encode_indices_midpoint, 10);
|
|
|
|
uvec3 vertex_prefix = floatBitsToUint(vertices[0]);
|
|
uvec3 vertex_base_bit_size = uvec3(32 - encode_trailing_zero_bits) - encode_vertex_payload_bit_size;
|
|
if (vertex_base_bit_size.x > 0) {
|
|
bit_writer_write(writer, vertex_prefix.x >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.x),
|
|
vertex_base_bit_size.x);
|
|
}
|
|
if (vertex_base_bit_size.y > 0) {
|
|
bit_writer_write(writer, vertex_prefix.y >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.y),
|
|
vertex_base_bit_size.y);
|
|
}
|
|
if (vertex_base_bit_size.z > 0) {
|
|
bit_writer_write(writer, vertex_prefix.z >> (encode_trailing_zero_bits + encode_vertex_payload_bit_size.z),
|
|
vertex_base_bit_size.z);
|
|
}
|
|
}
|
|
|
|
uint32_t vertex_used_mask[6];
|
|
for (uint32_t processed_node_index = 0; processed_node_index < hw_node_count; processed_node_index++) {
|
|
if (processed_node_index != hw_node_index)
|
|
continue;
|
|
|
|
for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++)
|
|
vertex_used_mask[vertex_index] = radv_ballot(cluster, vertex_used[vertex_index]);
|
|
}
|
|
|
|
for (uint32_t vertex_index = 0; vertex_index < 6; vertex_index++) {
|
|
uvec3 vertex = floatBitsToUint(vertices[vertex_index]) >> encode_trailing_zero_bits;
|
|
vertex = vertex & uvec3((1ul << uint64_t(encode_vertex_payload_bit_size.x)) - 1,
|
|
(1ul << uint64_t(encode_vertex_payload_bit_size.y)) - 1,
|
|
(1ul << uint64_t(encode_vertex_payload_bit_size.z)) - 1);
|
|
|
|
for (uint32_t i = 0; i < pair_count; i++) {
|
|
uint32_t invocation = (node_invocations >> (i * 4)) & 0xf;
|
|
if ((vertex_used_mask[vertex_index] & (1u << invocation)) == 0)
|
|
continue;
|
|
|
|
uvec3 current_vertex = radv_read_invocation(cluster, invocation, vertex);
|
|
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
bit_writer_write(writer, current_vertex.x, encode_vertex_payload_bit_size.x);
|
|
bit_writer_write(writer, current_vertex.y, encode_vertex_payload_bit_size.y);
|
|
bit_writer_write(writer, current_vertex.z, encode_vertex_payload_bit_size.z);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (encode_geometry_id_payload_bit_size > 0) {
|
|
uint32_t geometry_id_payload_mask =
|
|
(encode_geometry_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_geometry_id_payload_bit_size) - 1);
|
|
uint32_t geometry_id_payloads[2] = {
|
|
geometry_id0 & geometry_id_payload_mask,
|
|
geometry_id1 & geometry_id_payload_mask,
|
|
};
|
|
|
|
for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) {
|
|
uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf;
|
|
|
|
uint32_t payload0 = radv_read_invocation(cluster, invocation, geometry_id_payloads[0]);
|
|
uint32_t payload1 = radv_read_invocation(cluster, invocation, geometry_id_payloads[1]);
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
bit_writer_write(writer, payload1, encode_geometry_id_payload_bit_size);
|
|
if (invocation != first_active_in_node)
|
|
bit_writer_write(writer, payload0, encode_geometry_id_payload_bit_size);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
bit_writer_write(writer, geometry_id0, encode_geometry_id_base_bit_size);
|
|
bit_writer_write(writer, triangle_id0, encode_triangle_id_base_bit_size);
|
|
}
|
|
|
|
if (encode_triangle_id_payload_bit_size > 0) {
|
|
uint32_t triangle_id_payload_mask =
|
|
(encode_triangle_id_payload_bit_size == 32) ? 0xffffffff : ((1u << encode_triangle_id_payload_bit_size) - 1);
|
|
uint32_t triangle_id_payloads[2] = {
|
|
triangle_id0 & triangle_id_payload_mask,
|
|
triangle_id1 & triangle_id_payload_mask,
|
|
};
|
|
|
|
for (uint32_t i = 0; i < pair_count; i++) {
|
|
uint32_t invocation = (node_invocations >> (i * 4)) & 0xf;
|
|
|
|
uint32_t payload0 = radv_read_invocation(cluster, invocation, triangle_id_payloads[0]);
|
|
uint32_t payload1 = radv_read_invocation(cluster, invocation, triangle_id_payloads[1]);
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
if (invocation != first_active_in_node)
|
|
bit_writer_write(writer, payload0, encode_triangle_id_payload_bit_size);
|
|
bit_writer_write(writer, payload1, encode_triangle_id_payload_bit_size);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
uint32_t triangle_pair_descs_size = RADV_GFX12_PRIMITIVE_NODE_PAIR_DESC_SIZE * pair_count;
|
|
/* Advance the writer so that the pair descriptors written below start at bit
 * 32 * 32 - triangle_pair_descs_size.
 */
uint32_t target = 32 * 32 - triangle_pair_descs_size;
|
|
uint32_t skip_count = target - writer.total_count;
|
|
if (skip_count <= 32)
|
|
bit_writer_write(writer, 0, skip_count);
|
|
else
|
|
bit_writer_skip_to(writer, target);
|
|
}
|
|
|
|
/* Translate the (invocation, array index) vertex references into indices into the vertex list written above,
 * which is ordered by array index first and by invocation index second.
 */
uint32_t encoded_vertex_indices = 0;
|
|
for (uint32_t i = 0; i < 6; i++) {
|
|
uint32_t vertex_index = uint32_t((vertex_indices >> (i * 8)) & 0xff);
|
|
uint32_t invocation = vertex_index >> 4;
|
|
uint32_t array_index = vertex_index & 0xf;
|
|
|
|
uint32_t encoded_index = bitCount(vertex_used_mask[array_index] & ((1u << invocation) - 1));
|
|
for (uint32_t j = 0; j < 5; j++) {
|
|
if (array_index > j) {
|
|
encoded_index += bitCount(vertex_used_mask[j]);
|
|
}
|
|
}
|
|
|
|
encoded_vertex_indices |= (encoded_index << (i * 4));
|
|
}
|
|
|
|
for (int32_t i = int32_t(pair_count) - 1; i >= 0; i--) {
|
|
uint32_t invocation = (node_invocations >> (uint32_t(i) * 4)) & 0xf;
|
|
|
|
bool has_second_triangle =
|
|
radv_read_invocation(cluster, invocation, pair_index_node_index1 != RADV_BVH_INVALID_NODE);
|
|
bool current_opaque0 = radv_read_invocation(cluster, invocation, opaque0);
|
|
bool current_opaque1 = radv_read_invocation(cluster, invocation, opaque1);
|
|
uint32_t current_encoded_vertex_indices = radv_read_invocation(cluster, invocation, encoded_vertex_indices);
|
|
|
|
if (cluster.invocation_index == first_active_in_node) {
|
|
bit_writer_write(writer, 1, 1); /* prim_range_stop */
|
|
bit_writer_write(writer, 0, 1); /* tri1_double_sided */
|
|
bit_writer_write(writer, (has_second_triangle && current_opaque1) ? 1 : 0, 1); /* tri1_opaque */
|
|
bit_writer_write(writer, has_second_triangle ? (current_encoded_vertex_indices >> 12) : 0,
|
|
12); /* tri1_v0_index, tri1_v1_index, tri1_v2_index */
|
|
bit_writer_write(writer, 0, 1); /* tri0_double_sided */
|
|
bit_writer_write(writer, current_opaque0 ? 1 : 0, 1); /* tri0_opaque */
|
|
bit_writer_write(writer, current_encoded_vertex_indices & 0xfff,
|
|
12); /* tri0_v0_index, tri0_v1_index, tri0_v2_index */
|
|
}
|
|
}
|
|
|
|
if (cluster.invocation_index == first_active_in_node)
|
|
bit_writer_finish(writer);
|
|
}
|