kk: Reuse as many poly utilities as possible for unrolling

We cannot use poly_unroll_restart since we may require a promoted index
size when we want to disable primitive restart for 16-bit indices. For
32-bit indices we cannot do much...
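For context: once restart is disabled, 0xFFFF must act as an ordinary 16-bit
index, so the unrolled output is emitted at 32-bit size and compared against a
restart sentinel that no promoted index can alias. A minimal sketch of that
rule (hypothetical helper, not driver code; it mirrors the restart_index and
out_el_size_B values set up in kk_unroll_geometry below):

#include <stdbool.h>
#include <stdint.h>

struct unroll_cfg {
   uint32_t restart_index; /* value treated as a primitive cut while unrolling */
   uint32_t out_el_size_B; /* element size of the unrolled index buffer */
};

static struct unroll_cfg
choose_unroll_cfg(bool promote_index_type, uint32_t app_restart_index)
{
   struct unroll_cfg cfg = {
      /* A promoted 16-bit index (<= 0xFFFF) can never equal UINT32_MAX, so
       * restart is effectively disabled for the unrolled draw. */
      .restart_index = promote_index_type ? UINT32_MAX : app_restart_index,
      /* The unroll kernel always writes 32-bit indices. */
      .out_el_size_B = 4u,
   };
   return cfg;
}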

Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40864>
Aitor Camacho 2026-04-07 21:22:57 +09:00 committed by Marge Bot
parent 1d57784308
commit 0c6019e265
8 changed files with 140 additions and 242 deletions

View file

@@ -32,7 +32,7 @@ static const struct spirv_to_nir_options spirv_options = {
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
.create_library = true,
-.printf = true,
+.printf = false, /* TODO_KOSMICKRISP Enable */
};
/* Standard optimization loop */

View file

@@ -1287,6 +1287,12 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
ctx->indentlevel--;
P_IND(ctx, "}\n");
break;
+/* This is only used by OpenCL shaders (because poly uses printf_abort even
+ * if we don't expose printf, need to actually fix this or implement printf
+ * in KK). Kinda hacked, but need to get things going. TODO_KOSMICKRISP */
+case nir_intrinsic_printf_abort:
+P_IND(ctx, "return;\n");
+break;
case nir_intrinsic_load_shared:
assert(nir_intrinsic_base(instr) == 0);
P(ctx, "*(threadgroup %s*)&shared_data[",

View file

@@ -8,214 +8,59 @@
#include "compiler/libcl/libcl_vk.h"
#include "compiler/shader_enums.h"
-static uint
-libkk_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
-{
-/* (0, 1), (1, 2), (2, 0) */
-if (prim == (num_prims - 1) && vert == 1)
-return 0;
-else
-return prim + vert;
-}
-/* Swap the two non-provoking vertices in odd triangles. This
- * generates a vertex ID list with a consistent winding order.
- *
- * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
- * inverse. This lets us reuse it for both vertex fetch and transform feedback.
- */
-static uint
-libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
-{
-unsigned pv = flatshade_first ? 0 : 2;
-bool even = (prim & 1) == 0;
-bool provoking = vert == pv;
-return (provoking || even) ? vert : ((3 - pv) - vert);
-}
#include "poly/cl/restart.h"
#include "poly/geometry.h"
static uint
-libkk_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
+load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
+uint index_size)
{
-/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
- * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
- * Piglit clipflat expects us to switch between these orders depending on
- * provoking vertex, to avoid trivializing the fan.
- *
- * Rotate accordingly.
- */
-if (flatshade_first) {
-vert = (vert == 2) ? 0 : (vert + 1);
-}
-/* The simpler form assuming last is provoking. */
-return (vert == 0) ? 0 : prim + vert;
-}
-static uint
-libkk_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
-bool flatshade_first)
-{
-/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
- *
- * There are different cases for first/middle/last/only primitives and for
- * odd/even primitives. Determine which case we're in.
- */
-bool last = prim == (num_prims - 1);
-bool first = prim == 0;
-bool even = (prim & 1) == 0;
-bool even_or_first = even || first;
-/* When the last vertex is provoking, we rotate the primitives
- * accordingly. This seems required for OpenGL.
- */
-if (!flatshade_first && !even_or_first) {
-vert = (vert + 4u) % 6u;
-}
-/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
- * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
- *
- * Here we assume the first vertex is provoking, the Vulkan default.
- */
-const uint offsets[6] = {
-0,
-first ? 1 : (even ? -2 : 3),
-even_or_first ? 2 : 4,
-last ? 5 : 6,
-even_or_first ? 4 : 2,
-even_or_first ? 3 : -2,
-};
-/* Ensure NIR can see thru the local array */
-uint offset = 0;
-for (uint i = 1; i < 6; ++i) {
-if (i == vert)
-offset = offsets[i];
-}
-/* Finally add to the base of the primitive */
-return (prim * 2) + offset;
-}
-static uint
-vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
-uint vert, uint num_prims)
-{
-switch (mode) {
-case MESA_PRIM_POINTS:
-case MESA_PRIM_LINES:
-case MESA_PRIM_TRIANGLES:
-case MESA_PRIM_LINES_ADJACENCY:
-case MESA_PRIM_TRIANGLES_ADJACENCY:
-/* Regular primitive: every N vertices defines a primitive */
-return (prim * mesa_vertices_per_prim(mode)) + vert;
-case MESA_PRIM_LINE_LOOP:
-return libkk_vertex_id_for_line_loop(prim, vert, num_prims);
-case MESA_PRIM_LINE_STRIP:
-case MESA_PRIM_LINE_STRIP_ADJACENCY:
-/* (i, i + 1) or (i, ..., i + 3) */
-return prim + vert;
-case MESA_PRIM_TRIANGLE_STRIP: {
-/* Order depends on the provoking vert.
- *
- * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
- * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
- *
- * Pull the (maybe swapped) vert from the corresponding primitive
- */
-return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
-}
-case MESA_PRIM_TRIANGLE_FAN:
-return libkk_vertex_id_for_tri_fan(prim, vert, flatshade_first);
-case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
-return libkk_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
-flatshade_first);
-default:
-return 0;
-}
-}
-static void
-store_index(global uint8_t *index_buffer, uint index_size_B, uint id,
-uint value)
-{
-global uint32_t *out_32 = (global uint32_t *)index_buffer;
-global uint16_t *out_16 = (global uint16_t *)index_buffer;
-global uint8_t *out_8 = (global uint8_t *)index_buffer;
-if (index_size_B == 4)
-out_32[id] = value;
-else if (index_size_B == 2)
-out_16[id] = value;
-else
-out_8[id] = value;
-}
-static uint
-load_index(constant uint8_t *index_buffer, uint32_t index_buffer_range_el,
-uint id, uint index_size)
-{
-/* We have no index buffer, index is the id */
+/* We have no index buffer, index is the id. Required for index promotion. */
if (index_buffer == 0u)
return id;
-/* When no index_buffer is present, index_buffer_range_el is vtx count */
-bool oob = id >= index_buffer_range_el;
-/* If the load would be out-of-bounds, load the first element which is
- * assumed valid. If the application index buffer is empty with robustness2,
- * index_buffer will point to a zero sink where only the first is valid.
- */
-if (oob) {
-id = 0u;
-}
-uint el;
-if (index_size == 1) {
-el = ((constant uint8_t *)index_buffer)[id];
-} else if (index_size == 2) {
-el = ((constant uint16_t *)index_buffer)[id];
-} else {
-el = ((constant uint32_t *)index_buffer)[id];
-}
-/* D3D robustness semantics. TODO: Optimize? */
-if (oob) {
-el = 0;
-}
-return el;
+return poly_load_index(index_buffer, index_buffer_range_el, id, index_size);
}
/*
- * Return the ID of the first thread in the workgroup where cond is true, or
- * 1024 if cond is false across the workgroup.
+ * Same as poly_setup_unroll_for_draw but for non-indexed. Only changes how the
+ * out_draw is built.
*/
-static uint
-first_true_thread_in_workgroup(bool cond, local uint *scratch)
+static inline global void *
+kk_setup_unroll_for_non_indexed_draw(global struct poly_heap *heap,
+constant uint *in_draw,
+global uint *out_draw, enum mesa_prim mode,
+uint index_size_B)
{
-barrier(CLK_LOCAL_MEM_FENCE);
-scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
-barrier(CLK_LOCAL_MEM_FENCE);
+/* Determine an upper bound on the memory required for the index buffer.
+ * Restarts only decrease the unrolled index buffer size, so the maximum size
+ * is the unrolled size when the input has no restarts.
+ */
+uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
+uint max_verts = max_prims * mesa_vertices_per_prim(mode);
+uint alloc_size = max_verts * index_size_B;
-uint first_group =
-ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
-uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
-return (first_group * 32) + off;
+/* Allocate unrolled index buffer.
+ *
+ * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
+ * currently wired up in any driver.
+ */
+uint old_heap_bottom_B = poly_heap_alloc_offs(heap, alloc_size);
+/* Setup most of the descriptor. Count will be determined after unroll. */
+out_draw[1] = in_draw[1]; /* instance count */
+out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */
+out_draw[3] = in_draw[2]; /* index bias */
+out_draw[4] = in_draw[3]; /* base instance */
+/* Return the index buffer we allocated */
+return (global uchar *)heap->base + old_heap_bottom_B;
}
-// KERNEL(1024)
+/* TODO_KOSMICKRISP KERNEL(1024) */
KERNEL(1)
libkk_unroll_geometry_and_restart(
-constant uint8_t *index_buffer, global uint8_t *out_ptr,
+uint64_t index_buffer, global struct poly_heap *heap,
constant uint32_t *in_draw, global uint32_t *out_draw,
uint32_t restart_index, uint32_t index_buffer_size_el, uint32_t in_el_size_B,
uint32_t out_el_size_B, uint32_t flatshade_first, uint32_t mode)
@@ -223,10 +68,21 @@ libkk_unroll_geometry_and_restart(
uint tid = cl_local_id.x;
uint count = in_draw[0];
-constant uint8_t *in_ptr =
-index_buffer ? index_buffer + (in_draw[2] * in_el_size_B) : index_buffer;
+uintptr_t out_ptr;
+if (tid == 0) {
+if (index_buffer)
+out_ptr = (uintptr_t)poly_setup_unroll_for_draw(
+heap, in_draw, out_draw, mode, out_el_size_B);
+else
+out_ptr = (uintptr_t)kk_setup_unroll_for_non_indexed_draw(
+heap, in_draw, out_draw, mode, out_el_size_B);
+}
-// local uint scratch[32];
+uintptr_t in_ptr = index_buffer
+? (uintptr_t)index_buffer + (in_draw[2] * in_el_size_B)
+: (uintptr_t)index_buffer;
+/* TODO_KOSMICKRISP local uint scratch[32]; */
uint out_prims = 0;
uint needle = 0;
@@ -240,11 +96,13 @@
idx >= count || load_index(in_ptr, index_buffer_size_el, idx,
in_el_size_B) == restart_index;
-// uint next_offs = first_true_thread_in_workgroup(restart, scratch);
+/* TODO_KOSMICKRISP Uncomment this when subgroups are reliable
+uint next_offs = poly_work_group_first_true(restart, scratch);
-// next_restart += next_offs;
-// if (next_offs < 1024)
-// break;
+next_restart += next_offs;
+if (next_offs < cl_local_size.x)
+break;
+*/
if (restart)
break;
next_restart++;
@@ -254,17 +112,17 @@
uint subcount = next_restart - needle;
uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
uint out_prims_base = out_prims;
-for (uint i = tid; i < subprims; /*i += 1024*/ ++i) {
+for (uint i = tid; i < subprims; /*i += cl_local_size.x*/ ++i) {
for (uint vtx = 0; vtx < per_prim; ++vtx) {
-uint id =
-vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
+uint id = poly_vertex_id_for_topology(mode, flatshade_first, i, vtx,
+subprims);
uint offset = needle + id;
uint x = ((out_prims_base + i) * per_prim) + vtx;
uint y =
load_index(in_ptr, index_buffer_size_el, offset, in_el_size_B);
store_index(out_ptr, out_el_size_B, x, y);
poly_store_index(out_ptr, out_el_size_B, x, y);
}
}
@@ -273,10 +131,6 @@
}
if (tid == 0) {
-out_draw[0] = out_prims * per_prim; /* indexCount */
-out_draw[1] = in_draw[1]; /* instanceCount */
-out_draw[2] = 0u; /* firstIndex */
-out_draw[3] = index_buffer ? in_draw[3] : in_draw[2]; /* vertexOffset */
-out_draw[4] = index_buffer ? in_draw[4] : in_draw[3]; /* firstInstance */
+out_draw[0] = out_prims * per_prim;
}
}
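For reference, the five uints written through out_draw follow the layout of
VkDrawIndexedIndirectCommand; a sketch of the mapping assumed by the kernel
above and by kk_setup_unroll_for_non_indexed_draw:

#include <stdint.h>

struct draw_indexed_indirect { /* mirrors VkDrawIndexedIndirectCommand */
   uint32_t index_count;    /* out_draw[0], written after unrolling */
   uint32_t instance_count; /* out_draw[1], copied from in_draw */
   uint32_t first_index;    /* out_draw[2], heap offset / index size */
   int32_t vertex_offset;   /* out_draw[3], index bias */
   uint32_t first_instance; /* out_draw[4], base instance */
};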

View file

@@ -105,6 +105,7 @@ kk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
kk_cmd_release_resources(dev, cmd);
memset(&cmd->state, 0, sizeof(cmd->state));
+cmd->uses_heap = false;
}
const struct vk_command_buffer_ops kk_cmd_buffer_ops = {

View file

@@ -138,9 +138,6 @@ struct kk_graphics_state {
struct {
struct kk_addr_range addr_range[KK_MAX_VBUFS];
mtl_buffer *handles[KK_MAX_VBUFS];
-/* Required to understand maximum size of index buffer if primitive is
- * triangle fans */
-uint32_t max_vertices;
} vb;
/* Needed by vk_command_buffer::dynamic_graphics_state */
@@ -170,6 +167,9 @@ struct kk_cmd_buffer {
/* Owned large BOs */
struct util_dynarray large_bos;
+/* Does the command buffer use the geometry heap? */
+bool uses_heap;
};
VK_DEFINE_HANDLE_CASTS(kk_cmd_buffer, vk.base, VkCommandBuffer,
@@ -213,6 +213,11 @@ kk_cmd_buffer_dirty_all_gfx(struct kk_cmd_buffer *cmd)
cmd->state.dirty_shaders = ~0u;
cmd->state.gfx.dirty = ~0u;
cmd->state.gfx.descriptors.root_dirty = true;
+/* We just flushed out the heap use. If we want to use it again, we'll need
+ * to queue a free for it again.
+ */
+cmd->uses_heap = false;
}
void kk_cmd_release_dynamic_ds_state(struct kk_cmd_buffer *cmd);

View file

@@ -19,6 +19,8 @@
#include "kosmickrisp/bridge/mtl_bridge.h"
#include "kosmickrisp/bridge/vk_to_mtl_map.h"
#include "poly/geometry.h"
#include "vulkan/util/vk_format.h"
static void
@@ -766,7 +768,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
IS_DIRTY(VI_BINDING_STRIDES) || gfx->dirty & KK_DIRTY_VB) {
struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
unsigned slot = 0;
-cmd->state.gfx.vb.max_vertices = 0u;
u_foreach_bit(i, vs->info.vs.attribs_read) {
if (dyn->vi->attributes_valid & BITFIELD_BIT(i)) {
struct vk_vertex_attribute_state attr = dyn->vi->attributes[i];
@@ -778,10 +779,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
&desc->root.draw.attrib_base[slot]);
desc->root.draw.buffer_strides[attr.binding] =
dyn->vi_binding_strides[attr.binding];
-cmd->state.gfx.vb.max_vertices =
-MAX2(vb.range / dyn->vi_binding_strides[attr.binding],
-cmd->state.gfx.vb.max_vertices);
}
slot++;
}
@@ -843,12 +840,52 @@ struct kk_draw_data {
bool restart;
};
+static void
+kk_init_heap(const void *data)
+{
+struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data;
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
+size_t size = 128 * 1024 * 1024;
+kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap);
+struct poly_heap *map = (struct poly_heap *)dev->heap->cpu;
+/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+*map = (struct poly_heap){
+.base = dev->heap->gpu + sizeof(struct poly_heap),
+.size = size - sizeof(struct poly_heap),
+};
+}
+static uint64_t
+kk_heap(struct kk_cmd_buffer *cmd)
+{
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
+util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
+/* We need to free all allocations after each command buffer execution */
+if (!cmd->uses_heap) {
+uint64_t addr = dev->heap->gpu;
+/* Zeroing the allocated index frees everything */
+kk_cmd_write(cmd, (struct libkk_imm_write){
+addr + offsetof(struct poly_heap, bottom), 0});
+cmd->uses_heap = true;
+}
+return dev->heap->gpu;
+}
/* Unrolling will always be done through indirect rendering, so if this is
* called from non-indirect calls, we will fake it. */
static struct kk_draw_data
kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
bool promote_index_type)
{
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
if (!data.indirect) {
if (data.indexed) {
VkDrawIndexedIndirectCommand draw = {
@@ -875,38 +912,24 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
}
}
-uint32_t el_count = cmd->state.gfx.vb.max_vertices;
-if (data.indexed) {
-el_count =
-(mtl_buffer_get_length(data.index_buffer) - data.index_buffer_offset) /
-data.index_size;
-}
+struct kk_bo *out_draw =
+kk_cmd_allocate_buffer(cmd, sizeof(VkDrawIndexedIndirectCommand), 4u);
-uint32_t decomposed_index_count =
-u_decomposed_prims_for_vertices(data.prim, el_count) *
-mesa_vertices_per_prim(data.prim);
-uint32_t el_size_B = 4u;
-uint32_t index_buffer_size_B = decomposed_index_count * el_size_B;
-uint32_t buffer_size_B =
-sizeof(VkDrawIndexedIndirectCommand) + index_buffer_size_B;
-struct kk_bo *index_buffer =
-kk_cmd_allocate_buffer(cmd, buffer_size_B, el_size_B);
-if (!index_buffer)
+if (!out_draw)
return data;
struct libkk_unroll_geometry_and_restart_args info = {
.index_buffer = mtl_buffer_get_gpu_address(data.index_buffer) +
data.index_buffer_offset,
-.out_ptr = index_buffer->gpu + sizeof(VkDrawIndexedIndirectCommand),
+.heap = kk_heap(cmd),
.in_draw = mtl_buffer_get_gpu_address(data.indirect_buffer) +
data.indirect_buffer_offset,
-.out_draw = index_buffer->gpu,
+.out_draw = out_draw->gpu,
.restart_index =
promote_index_type ? UINT32_MAX : cmd->state.gfx.index.restart,
.index_buffer_size_el = data.index_buffer_range_B,
.in_el_size_B = data.index_size,
-.out_el_size_B = el_size_B,
+.out_el_size_B = 4u,
.flatshade_first = true,
.mode = data.prim,
};
@@ -914,14 +937,15 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
struct mtl_size grid = {1, 1, 1};
libkk_unroll_geometry_and_restart_struct(cmd, grid, true, info);
-data.indirect_buffer = index_buffer->map;
-data.index_buffer = index_buffer->map;
-data.index_buffer_offset = sizeof(VkDrawIndexedIndirectCommand);
+data.indirect_buffer = out_draw->map;
+data.index_buffer = dev->heap->map;
+/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+data.index_buffer_offset = sizeof(struct poly_heap);
data.indirect_buffer_offset = 0u;
-data.index_buffer_range_B = index_buffer_size_B;
+data.index_buffer_range_B = dev->heap->size_B - sizeof(struct poly_heap);
data.first_index = 0u;
data.prim = u_decomposed_prim(data.prim);
-data.index_size = el_size_B;
+data.index_size = 4u;
data.indirect = true;
data.indexed = true;
data.restart = false;
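The heap backing these allocations is a simple bump allocator; a sketch of the
assumed poly_heap semantics (field names taken from kk_init_heap above, the
poly_heap_alloc_offs behavior inferred), showing why zeroing bottom in
kk_heap() frees every allocation from the previous execution:

#include <stdint.h>

struct heap_sketch {
   uint64_t base;   /* GPU address of the first allocatable byte */
   uint64_t size;   /* bytes available after the poly_heap header */
   uint64_t bottom; /* bump offset; 0 means everything is free */
};

static uint64_t
heap_alloc_offs(struct heap_sketch *h, uint64_t size_B)
{
   /* Return the current bottom and bump it; there is no per-allocation
    * free, so resetting bottom to 0 releases all allocations at once. */
   uint64_t off = h->bottom;
   h->bottom += size_B;
   return off;
}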

View file

@@ -287,6 +287,10 @@ kk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
kk_query_table_finish(dev, &dev->occlusion_queries);
kk_destroy_sampler_heap(dev, &dev->samplers);
+/* Geometry heap */
+if (dev->heap)
+kk_destroy_bo(dev, dev->heap);
/* Release the residency set last once all BOs are released. */
mtl_release(dev->residency_set.handle);
simple_mtx_destroy(&dev->residency_set.mutex);

View file

@@ -97,6 +97,10 @@ struct kk_device {
struct vk_meta_device meta;
+/* Geometry heap */
+struct kk_bo *heap;
+util_once_flag heap_init_once;
uint64_t disabled_workarounds;
bool gpu_capture_enabled;
};