kk: Reuse as many poly utilities as possible for unrolling

We cannot use poly_unroll_restart since we may require a promoted index
size when we want to disable primitive restart for 16-bit indices. For
32-bit indices we cannot do much...
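For context: once restart is disabled, 0xFFFF must act as an ordinary 16-bit
index, so the unrolled output is emitted at 32-bit size and compared against a
restart sentinel that no promoted index can alias. A minimal sketch of that
rule (hypothetical helper, not driver code; it mirrors the restart_index and
out_el_size_B values set up in kk_unroll_geometry below):

#include <stdbool.h>
#include <stdint.h>

struct unroll_cfg {
   uint32_t restart_index; /* value treated as a primitive cut while unrolling */
   uint32_t out_el_size_B; /* element size of the unrolled index buffer */
};

static struct unroll_cfg
choose_unroll_cfg(bool promote_index_type, uint32_t app_restart_index)
{
   struct unroll_cfg cfg = {
      /* A promoted 16-bit index (<= 0xFFFF) can never equal UINT32_MAX, so
       * restart is effectively disabled for the unrolled draw. */
      .restart_index = promote_index_type ? UINT32_MAX : app_restart_index,
      /* The unroll kernel always writes 32-bit indices. */
      .out_el_size_B = 4u,
   };
   return cfg;
}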

Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40864>
Aitor Camacho 2026-04-07 21:22:57 +09:00 committed by Marge Bot
parent 1d57784308
commit 0c6019e265
8 changed files with 140 additions and 242 deletions

View file

@@ -32,7 +32,7 @@ static const struct spirv_to_nir_options spirv_options = {
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
.create_library = true,
-.printf = true,
+.printf = false, /* TODO_KOSMICKRISP Enable */
};
/* Standard optimization loop */

View file

@@ -1287,6 +1287,12 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
ctx->indentlevel--;
P_IND(ctx, "}\n");
break;
+/* This is only used by OpenCL shaders (because poly uses printf_abort even
+ * if we don't expose printf, need to actually fix this or implement printf
+ * in KK). Kinda hacked, but need to get things going. TODO_KOSMICKRISP */
+case nir_intrinsic_printf_abort:
+P_IND(ctx, "return;\n");
+break;
case nir_intrinsic_load_shared:
assert(nir_intrinsic_base(instr) == 0);
P(ctx, "*(threadgroup %s*)&shared_data[",

View file

@@ -8,214 +8,59 @@
#include "compiler/libcl/libcl_vk.h"
#include "compiler/shader_enums.h"
-static uint
-libkk_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
-{
-/* (0, 1), (1, 2), (2, 0) */
-if (prim == (num_prims - 1) && vert == 1)
-return 0;
-else
-return prim + vert;
-}
-/* Swap the two non-provoking vertices in odd triangles. This
- * generates a vertex ID list with a consistent winding order.
- *
- * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
- * inverse. This lets us reuse it for both vertex fetch and transform feedback.
- */
-static uint
-libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
-{
-unsigned pv = flatshade_first ? 0 : 2;
-bool even = (prim & 1) == 0;
-bool provoking = vert == pv;
-return (provoking || even) ? vert : ((3 - pv) - vert);
-}
#include "poly/cl/restart.h"
#include "poly/geometry.h"
static uint
-libkk_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
+load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
+uint index_size)
{
-/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
- * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
- * Piglit clipflat expects us to switch between these orders depending on
- * provoking vertex, to avoid trivializing the fan.
- *
- * Rotate accordingly.
- */
-if (flatshade_first) {
-vert = (vert == 2) ? 0 : (vert + 1);
-}
-/* The simpler form assuming last is provoking. */
-return (vert == 0) ? 0 : prim + vert;
-}
-static uint
-libkk_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
-bool flatshade_first)
-{
-/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
- *
- * There are different cases for first/middle/last/only primitives and for
- * odd/even primitives. Determine which case we're in.
- */
-bool last = prim == (num_prims - 1);
-bool first = prim == 0;
-bool even = (prim & 1) == 0;
-bool even_or_first = even || first;
-/* When the last vertex is provoking, we rotate the primitives
- * accordingly. This seems required for OpenGL.
- */
-if (!flatshade_first && !even_or_first) {
-vert = (vert + 4u) % 6u;
-}
-/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
- * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
- *
- * Here we assume the first vertex is provoking, the Vulkan default.
- */
-const uint offsets[6] = {
-0,
-first ? 1 : (even ? -2 : 3),
-even_or_first ? 2 : 4,
-last ? 5 : 6,
-even_or_first ? 4 : 2,
-even_or_first ? 3 : -2,
-};
-/* Ensure NIR can see thru the local array */
-uint offset = 0;
-for (uint i = 1; i < 6; ++i) {
-if (i == vert)
-offset = offsets[i];
-}
-/* Finally add to the base of the primitive */
-return (prim * 2) + offset;
-}
-static uint
-vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
-uint vert, uint num_prims)
-{
-switch (mode) {
-case MESA_PRIM_POINTS:
-case MESA_PRIM_LINES:
-case MESA_PRIM_TRIANGLES:
-case MESA_PRIM_LINES_ADJACENCY:
-case MESA_PRIM_TRIANGLES_ADJACENCY:
-/* Regular primitive: every N vertices defines a primitive */
-return (prim * mesa_vertices_per_prim(mode)) + vert;
-case MESA_PRIM_LINE_LOOP:
-return libkk_vertex_id_for_line_loop(prim, vert, num_prims);
-case MESA_PRIM_LINE_STRIP:
-case MESA_PRIM_LINE_STRIP_ADJACENCY:
-/* (i, i + 1) or (i, ..., i + 3) */
-return prim + vert;
-case MESA_PRIM_TRIANGLE_STRIP: {
-/* Order depends on the provoking vert.
- *
- * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
- * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
- *
- * Pull the (maybe swapped) vert from the corresponding primitive
- */
-return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
-}
-case MESA_PRIM_TRIANGLE_FAN:
-return libkk_vertex_id_for_tri_fan(prim, vert, flatshade_first);
-case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
-return libkk_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
-flatshade_first);
-default:
-return 0;
-}
-}
-static void
-store_index(global uint8_t *index_buffer, uint index_size_B, uint id,
-uint value)
-{
-global uint32_t *out_32 = (global uint32_t *)index_buffer;
-global uint16_t *out_16 = (global uint16_t *)index_buffer;
-global uint8_t *out_8 = (global uint8_t *)index_buffer;
-if (index_size_B == 4)
-out_32[id] = value;
-else if (index_size_B == 2)
-out_16[id] = value;
-else
-out_8[id] = value;
-}
-static uint
-load_index(constant uint8_t *index_buffer, uint32_t index_buffer_range_el,
-uint id, uint index_size)
-{
-/* We have no index buffer, index is the id */
+/* We have no index buffer, index is the id. Required for index promotion. */
if (index_buffer == 0u)
return id;
-/* When no index_buffer is present, index_buffer_range_el is vtx count */
-bool oob = id >= index_buffer_range_el;
-/* If the load would be out-of-bounds, load the first element which is
- * assumed valid. If the application index buffer is empty with robustness2,
- * index_buffer will point to a zero sink where only the first is valid.
- */
-if (oob) {
-id = 0u;
-}
-uint el;
-if (index_size == 1) {
-el = ((constant uint8_t *)index_buffer)[id];
-} else if (index_size == 2) {
-el = ((constant uint16_t *)index_buffer)[id];
-} else {
-el = ((constant uint32_t *)index_buffer)[id];
-}
-/* D3D robustness semantics. TODO: Optimize? */
-if (oob) {
-el = 0;
-}
-return el;
+return poly_load_index(index_buffer, index_buffer_range_el, id, index_size);
}
/*
- * Return the ID of the first thread in the workgroup where cond is true, or
- * 1024 if cond is false across the workgroup.
+ * Same as poly_setup_unroll_for_draw but for non-indexed. Only changes how the
+ * out_draw is built.
*/
-static uint
-first_true_thread_in_workgroup(bool cond, local uint *scratch)
+static inline global void *
+kk_setup_unroll_for_non_indexed_draw(global struct poly_heap *heap,
+constant uint *in_draw,
+global uint *out_draw, enum mesa_prim mode,
+uint index_size_B)
{
-barrier(CLK_LOCAL_MEM_FENCE);
-scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
-barrier(CLK_LOCAL_MEM_FENCE);
+/* Determine an upper bound on the memory required for the index buffer.
+ * Restarts only decrease the unrolled index buffer size, so the maximum size
+ * is the unrolled size when the input has no restarts.
+ */
+uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
+uint max_verts = max_prims * mesa_vertices_per_prim(mode);
+uint alloc_size = max_verts * index_size_B;
-uint first_group =
-ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
-uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
-return (first_group * 32) + off;
+/* Allocate unrolled index buffer.
+ *
+ * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
+ * currently wired up in any driver.
+ */
+uint old_heap_bottom_B = poly_heap_alloc_offs(heap, alloc_size);
+/* Setup most of the descriptor. Count will be determined after unroll. */
+out_draw[1] = in_draw[1]; /* instance count */
+out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */
+out_draw[3] = in_draw[2]; /* index bias */
+out_draw[4] = in_draw[3]; /* base instance */
+/* Return the index buffer we allocated */
+return (global uchar *)heap->base + old_heap_bottom_B;
}
-// KERNEL(1024)
+/* TODO_KOSMICKRISP KERNEL(1024) */
KERNEL(1)
libkk_unroll_geometry_and_restart(
-constant uint8_t *index_buffer, global uint8_t *out_ptr,
+uint64_t index_buffer, global struct poly_heap *heap,
constant uint32_t *in_draw, global uint32_t *out_draw,
uint32_t restart_index, uint32_t index_buffer_size_el, uint32_t in_el_size_B,
uint32_t out_el_size_B, uint32_t flatshade_first, uint32_t mode)
@@ -223,10 +68,21 @@ libkk_unroll_geometry_and_restart(
uint tid = cl_local_id.x;
uint count = in_draw[0];
-constant uint8_t *in_ptr =
-index_buffer ? index_buffer + (in_draw[2] * in_el_size_B) : index_buffer;
+uintptr_t out_ptr;
+if (tid == 0) {
+if (index_buffer)
+out_ptr = (uintptr_t)poly_setup_unroll_for_draw(
+heap, in_draw, out_draw, mode, out_el_size_B);
+else
+out_ptr = (uintptr_t)kk_setup_unroll_for_non_indexed_draw(
+heap, in_draw, out_draw, mode, out_el_size_B);
+}
-// local uint scratch[32];
+uintptr_t in_ptr = index_buffer
+? (uintptr_t)index_buffer + (in_draw[2] * in_el_size_B)
+: (uintptr_t)index_buffer;
+/* TODO_KOSMICKRISP local uint scratch[32]; */
uint out_prims = 0;
uint needle = 0;
@@ -240,11 +96,13 @@
idx >= count || load_index(in_ptr, index_buffer_size_el, idx,
in_el_size_B) == restart_index;
-// uint next_offs = first_true_thread_in_workgroup(restart, scratch);
+/* TODO_KOSMICKRISP Uncomment this when subgroups are reliable
+uint next_offs = poly_work_group_first_true(restart, scratch);
-// next_restart += next_offs;
-// if (next_offs < 1024)
-// break;
+next_restart += next_offs;
+if (next_offs < cl_local_size.x)
+break;
+*/
if (restart)
break;
next_restart++;
@@ -254,17 +112,17 @@
uint subcount = next_restart - needle;
uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
uint out_prims_base = out_prims;
-for (uint i = tid; i < subprims; /*i += 1024*/ ++i) {
+for (uint i = tid; i < subprims; /*i += cl_local_size.x*/ ++i) {
for (uint vtx = 0; vtx < per_prim; ++vtx) {
-uint id =
-vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
+uint id = poly_vertex_id_for_topology(mode, flatshade_first, i, vtx,
+subprims);
uint offset = needle + id;
uint x = ((out_prims_base + i) * per_prim) + vtx;
uint y =
load_index(in_ptr, index_buffer_size_el, offset, in_el_size_B);
store_index(out_ptr, out_el_size_B, x, y);
poly_store_index(out_ptr, out_el_size_B, x, y);
}
}
@@ -273,10 +131,6 @@
}
if (tid == 0) {
-out_draw[0] = out_prims * per_prim; /* indexCount */
-out_draw[1] = in_draw[1]; /* instanceCount */
-out_draw[2] = 0u; /* firstIndex */
-out_draw[3] = index_buffer ? in_draw[3] : in_draw[2]; /* vertexOffset */
-out_draw[4] = index_buffer ? in_draw[4] : in_draw[3]; /* firstInstance */
+out_draw[0] = out_prims * per_prim;
}
}
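For reference, the five uints written through out_draw follow the layout of
VkDrawIndexedIndirectCommand; a sketch of the mapping assumed by the kernel
above and by kk_setup_unroll_for_non_indexed_draw:

#include <stdint.h>

struct draw_indexed_indirect { /* mirrors VkDrawIndexedIndirectCommand */
   uint32_t index_count;    /* out_draw[0], written after unrolling */
   uint32_t instance_count; /* out_draw[1], copied from in_draw */
   uint32_t first_index;    /* out_draw[2], heap offset / index size */
   int32_t vertex_offset;   /* out_draw[3], index bias */
   uint32_t first_instance; /* out_draw[4], base instance */
};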

View file

@@ -105,6 +105,7 @@ kk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
kk_cmd_release_resources(dev, cmd);
memset(&cmd->state, 0, sizeof(cmd->state));
+cmd->uses_heap = false;
}
const struct vk_command_buffer_ops kk_cmd_buffer_ops = {

View file

@@ -138,9 +138,6 @@ struct kk_graphics_state {
struct {
struct kk_addr_range addr_range[KK_MAX_VBUFS];
mtl_buffer *handles[KK_MAX_VBUFS];
-/* Required to understand maximum size of index buffer if primitive is
- * triangle fans */
-uint32_t max_vertices;
} vb;
/* Needed by vk_command_buffer::dynamic_graphics_state */
@@ -170,6 +167,9 @@ struct kk_cmd_buffer {
/* Owned large BOs */
struct util_dynarray large_bos;
+/* Does the command buffer use the geometry heap? */
+bool uses_heap;
};
VK_DEFINE_HANDLE_CASTS(kk_cmd_buffer, vk.base, VkCommandBuffer,
@@ -213,6 +213,11 @@ kk_cmd_buffer_dirty_all_gfx(struct kk_cmd_buffer *cmd)
cmd->state.dirty_shaders = ~0u;
cmd->state.gfx.dirty = ~0u;
cmd->state.gfx.descriptors.root_dirty = true;
+/* We just flushed out the heap use. If we want to use it again, we'll need
+ * to queue a free for it again.
+ */
+cmd->uses_heap = false;
}
void kk_cmd_release_dynamic_ds_state(struct kk_cmd_buffer *cmd);

View file

@@ -19,6 +19,8 @@
#include "kosmickrisp/bridge/mtl_bridge.h"
#include "kosmickrisp/bridge/vk_to_mtl_map.h"
#include "poly/geometry.h"
#include "vulkan/util/vk_format.h"
static void
@@ -766,7 +768,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
IS_DIRTY(VI_BINDING_STRIDES) || gfx->dirty & KK_DIRTY_VB) {
struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
unsigned slot = 0;
-cmd->state.gfx.vb.max_vertices = 0u;
u_foreach_bit(i, vs->info.vs.attribs_read) {
if (dyn->vi->attributes_valid & BITFIELD_BIT(i)) {
struct vk_vertex_attribute_state attr = dyn->vi->attributes[i];
@@ -778,10 +779,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
&desc->root.draw.attrib_base[slot]);
desc->root.draw.buffer_strides[attr.binding] =
dyn->vi_binding_strides[attr.binding];
-cmd->state.gfx.vb.max_vertices =
-MAX2(vb.range / dyn->vi_binding_strides[attr.binding],
-cmd->state.gfx.vb.max_vertices);
}
slot++;
}
@@ -843,12 +840,52 @@ struct kk_draw_data {
bool restart;
};
+static void
+kk_init_heap(const void *data)
+{
+struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data;
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
+size_t size = 128 * 1024 * 1024;
+kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap);
+struct poly_heap *map = (struct poly_heap *)dev->heap->cpu;
+/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+*map = (struct poly_heap){
+.base = dev->heap->gpu + sizeof(struct poly_heap),
+.size = size - sizeof(struct poly_heap),
+};
+}
+static uint64_t
+kk_heap(struct kk_cmd_buffer *cmd)
+{
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
+util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
+/* We need to free all allocations after each command buffer execution */
+if (!cmd->uses_heap) {
+uint64_t addr = dev->heap->gpu;
+/* Zeroing the allocated index frees everything */
+kk_cmd_write(cmd, (struct libkk_imm_write){
+addr + offsetof(struct poly_heap, bottom), 0});
+cmd->uses_heap = true;
+}
+return dev->heap->gpu;
+}
/* Unrolling will always be done through indirect rendering, so if this is
* called from non-indirect calls, we will fake it. */
static struct kk_draw_data
kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
bool promote_index_type)
{
+struct kk_device *dev = kk_cmd_buffer_device(cmd);
if (!data.indirect) {
if (data.indexed) {
VkDrawIndexedIndirectCommand draw = {
@@ -875,38 +912,24 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
}
}
-uint32_t el_count = cmd->state.gfx.vb.max_vertices;
-if (data.indexed) {
-el_count =
-(mtl_buffer_get_length(data.index_buffer) - data.index_buffer_offset) /
-data.index_size;
-}
+struct kk_bo *out_draw =
+kk_cmd_allocate_buffer(cmd, sizeof(VkDrawIndexedIndirectCommand), 4u);
-uint32_t decomposed_index_count =
-u_decomposed_prims_for_vertices(data.prim, el_count) *
-mesa_vertices_per_prim(data.prim);
-uint32_t el_size_B = 4u;
-uint32_t index_buffer_size_B = decomposed_index_count * el_size_B;
-uint32_t buffer_size_B =
-sizeof(VkDrawIndexedIndirectCommand) + index_buffer_size_B;
-struct kk_bo *index_buffer =
-kk_cmd_allocate_buffer(cmd, buffer_size_B, el_size_B);
-if (!index_buffer)
+if (!out_draw)
return data;
struct libkk_unroll_geometry_and_restart_args info = {
.index_buffer = mtl_buffer_get_gpu_address(data.index_buffer) +
data.index_buffer_offset,
-.out_ptr = index_buffer->gpu + sizeof(VkDrawIndexedIndirectCommand),
+.heap = kk_heap(cmd),
.in_draw = mtl_buffer_get_gpu_address(data.indirect_buffer) +
data.indirect_buffer_offset,
-.out_draw = index_buffer->gpu,
+.out_draw = out_draw->gpu,
.restart_index =
promote_index_type ? UINT32_MAX : cmd->state.gfx.index.restart,
.index_buffer_size_el = data.index_buffer_range_B,
.in_el_size_B = data.index_size,
-.out_el_size_B = el_size_B,
+.out_el_size_B = 4u,
.flatshade_first = true,
.mode = data.prim,
};
@@ -914,14 +937,15 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data,
struct mtl_size grid = {1, 1, 1};
libkk_unroll_geometry_and_restart_struct(cmd, grid, true, info);
-data.indirect_buffer = index_buffer->map;
-data.index_buffer = index_buffer->map;
-data.index_buffer_offset = sizeof(VkDrawIndexedIndirectCommand);
+data.indirect_buffer = out_draw->map;
+data.index_buffer = dev->heap->map;
+/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+data.index_buffer_offset = sizeof(struct poly_heap);
data.indirect_buffer_offset = 0u;
-data.index_buffer_range_B = index_buffer_size_B;
+data.index_buffer_range_B = dev->heap->size_B - sizeof(struct poly_heap);
data.first_index = 0u;
data.prim = u_decomposed_prim(data.prim);
-data.index_size = el_size_B;
+data.index_size = 4u;
data.indirect = true;
data.indexed = true;
data.restart = false;
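The heap backing these allocations is a simple bump allocator; a sketch of the
assumed poly_heap semantics (field names taken from kk_init_heap above, the
poly_heap_alloc_offs behavior inferred), showing why zeroing bottom in
kk_heap() frees every allocation from the previous execution:

#include <stdint.h>

struct heap_sketch {
   uint64_t base;   /* GPU address of the first allocatable byte */
   uint64_t size;   /* bytes available after the poly_heap header */
   uint64_t bottom; /* bump offset; 0 means everything is free */
};

static uint64_t
heap_alloc_offs(struct heap_sketch *h, uint64_t size_B)
{
   /* Return the current bottom and bump it; there is no per-allocation
    * free, so resetting bottom to 0 releases all allocations at once. */
   uint64_t off = h->bottom;
   h->bottom += size_B;
   return off;
}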

View file

@@ -287,6 +287,10 @@ kk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
kk_query_table_finish(dev, &dev->occlusion_queries);
kk_destroy_sampler_heap(dev, &dev->samplers);
+/* Geometry heap */
+if (dev->heap)
+kk_destroy_bo(dev, dev->heap);
/* Release the residency set last once all BOs are released. */
mtl_release(dev->residency_set.handle);
simple_mtx_destroy(&dev->residency_set.mutex);

View file

@@ -97,6 +97,10 @@ struct kk_device {
struct vk_meta_device meta;
+/* Geometry heap */
+struct kk_bo *heap;
+util_once_flag heap_init_once;
uint64_t disabled_workarounds;
bool gpu_capture_enabled;
};