diff --git a/src/kosmickrisp/clc/kk_clc.c b/src/kosmickrisp/clc/kk_clc.c index 5025be656e3..094d7ffb421 100644 --- a/src/kosmickrisp/clc/kk_clc.c +++ b/src/kosmickrisp/clc/kk_clc.c @@ -32,7 +32,7 @@ static const struct spirv_to_nir_options spirv_options = { .temp_addr_format = nir_address_format_62bit_generic, .constant_addr_format = nir_address_format_64bit_global, .create_library = true, - .printf = true, + .printf = false, /* TODO_KOSMICKRISP Enable */ }; /* Standard optimization loop */ diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c index 5ac2c28d999..66c32e48288 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.c +++ b/src/kosmickrisp/compiler/nir_to_msl.c @@ -1287,6 +1287,12 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) ctx->indentlevel--; P_IND(ctx, "}\n"); break; + /* This is only used by OpenCL shaders (because poly uses printf_abort even + * if we don't expose printf, need to actually fix this or implement printf + * in KK). Kinda hacked, but need to get things going. TODO_KOSMICKRISP */ + case nir_intrinsic_printf_abort: + P_IND(ctx, "return;\n"); + break; case nir_intrinsic_load_shared: assert(nir_intrinsic_base(instr) == 0); P(ctx, "*(threadgroup %s*)&shared_data[", diff --git a/src/kosmickrisp/libkk/kk_triangle_fan.cl b/src/kosmickrisp/libkk/kk_triangle_fan.cl index 7e53ed0f1ad..e4b71780c83 100644 --- a/src/kosmickrisp/libkk/kk_triangle_fan.cl +++ b/src/kosmickrisp/libkk/kk_triangle_fan.cl @@ -8,214 +8,59 @@ #include "compiler/libcl/libcl_vk.h" #include "compiler/shader_enums.h" -static uint -libkk_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) -{ - /* (0, 1), (1, 2), (2, 0) */ - if (prim == (num_prims - 1) && vert == 1) - return 0; - else - return prim + vert; -} - -/* Swap the two non-provoking vertices third vert in odd triangles. This - * generates a vertex ID list with a consistent winding order. 
- * - * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own - * inverse. This lets us reuse it for both vertex fetch and transform feedback. - */ -static uint -libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) -{ - unsigned pv = flatshade_first ? 0 : 2; - - bool even = (prim & 1) == 0; - bool provoking = vert == pv; - - return (provoking || even) ? vert : ((3 - pv) - vert); -} +#include "poly/cl/restart.h" +#include "poly/geometry.h" static uint -libkk_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) +load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, + uint index_size) { - /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking - * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last. - * Piglit clipflat expects us to switch between these orders depending on - * provoking vertex, to avoid trivializing the fan. - * - * Rotate accordingly. - */ - if (flatshade_first) { - vert = (vert == 2) ? 0 : (vert + 1); - } - - /* The simpler form assuming last is provoking. */ - return (vert == 0) ? 0 : prim + vert; -} - -static uint -libkk_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, - bool flatshade_first) -{ - /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjancency". - * - * There are different cases for first/middle/last/only primitives and for - * odd/even primitives. Determine which case we're in. - */ - bool last = prim == (num_prims - 1); - bool first = prim == 0; - bool even = (prim & 1) == 0; - bool even_or_first = even || first; - - /* When the last vertex is provoking, we rotate the primitives - * accordingly. This seems required for OpenGL. - */ - if (!flatshade_first && !even_or_first) { - vert = (vert + 4u) % 6u; - } - - /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily, - * there are lots of patterns we can exploit, avoiding a full 6x6 LUT. 
- * - * Here we assume the first vertex is provoking, the Vulkan default. - */ - const uint offsets[6] = { - 0, - first ? 1 : (even ? -2 : 3), - even_or_first ? 2 : 4, - last ? 5 : 6, - even_or_first ? 4 : 2, - even_or_first ? 3 : -2, - }; - - /* Ensure NIR can see thru the local array */ - uint offset = 0; - for (uint i = 1; i < 6; ++i) { - if (i == vert) - offset = offsets[i]; - } - - /* Finally add to the base of the primitive */ - return (prim * 2) + offset; -} - -static uint -vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, - uint vert, uint num_prims) -{ - switch (mode) { - case MESA_PRIM_POINTS: - case MESA_PRIM_LINES: - case MESA_PRIM_TRIANGLES: - case MESA_PRIM_LINES_ADJACENCY: - case MESA_PRIM_TRIANGLES_ADJACENCY: - /* Regular primitive: every N vertices defines a primitive */ - return (prim * mesa_vertices_per_prim(mode)) + vert; - - case MESA_PRIM_LINE_LOOP: - return libkk_vertex_id_for_line_loop(prim, vert, num_prims); - - case MESA_PRIM_LINE_STRIP: - case MESA_PRIM_LINE_STRIP_ADJACENCY: - /* (i, i + 1) or (i, ..., i + 3) */ - return prim + vert; - - case MESA_PRIM_TRIANGLE_STRIP: { - /* Order depends on the provoking vert. - * - * First: (0, 1, 2), (1, 3, 2), (2, 3, 4). - * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4). 
- * - * Pull the (maybe swapped) vert from the corresponding primitive - */ - return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first); - } - - case MESA_PRIM_TRIANGLE_FAN: - return libkk_vertex_id_for_tri_fan(prim, vert, flatshade_first); - - case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY: - return libkk_vertex_id_for_tri_strip_adj(prim, vert, num_prims, - flatshade_first); - - default: - return 0; - } -} - -static void -store_index(global uint8_t *index_buffer, uint index_size_B, uint id, - uint value) -{ - global uint32_t *out_32 = (global uint32_t *)index_buffer; - global uint16_t *out_16 = (global uint16_t *)index_buffer; - global uint8_t *out_8 = (global uint8_t *)index_buffer; - - if (index_size_B == 4) - out_32[id] = value; - else if (index_size_B == 2) - out_16[id] = value; - else - out_8[id] = value; -} - -static uint -load_index(constant uint8_t *index_buffer, uint32_t index_buffer_range_el, - uint id, uint index_size) -{ - /* We have no index buffer, index is the id */ + /* We have no index buffer, index is the id. Required for index promotion. */ if (index_buffer == 0u) return id; - /* When no index_buffer is present, index_buffer_range_el is vtx count */ - bool oob = id >= index_buffer_range_el; - - /* If the load would be out-of-bounds, load the first element which is - * assumed valid. If the application index buffer is empty with robustness2, - * index_buffer will point to a zero sink where only the first is valid. - */ - if (oob) { - id = 0u; - } - - uint el; - if (index_size == 1) { - el = ((constant uint8_t *)index_buffer)[id]; - } else if (index_size == 2) { - el = ((constant uint16_t *)index_buffer)[id]; - } else { - el = ((constant uint32_t *)index_buffer)[id]; - } - - /* D3D robustness semantics. TODO: Optimize? 
*/ - if (oob) { - el = 0; - } - - return el; + return poly_load_index(index_buffer, index_buffer_range_el, id, index_size); } /* - * Return the ID of the first thread in the workgroup where cond is true, or - * 1024 if cond is false across the workgroup. + * Same as poly_setup_unroll_for_draw but for non-indexed. Only changes how the + * out_draw is built. */ -static uint -first_true_thread_in_workgroup(bool cond, local uint *scratch) +static inline global void * +kk_setup_unroll_for_non_indexed_draw(global struct poly_heap *heap, + constant uint *in_draw, + global uint *out_draw, enum mesa_prim mode, + uint index_size_B) { - barrier(CLK_LOCAL_MEM_FENCE); - scratch[get_sub_group_id()] = sub_group_ballot(cond)[0]; - barrier(CLK_LOCAL_MEM_FENCE); + /* Determine an upper bound on the memory required for the index buffer. + * Restarts only decrease the unrolled index buffer size, so the maximum size + * is the unrolled size when the input has no restarts. + */ + uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]); + uint max_verts = max_prims * mesa_vertices_per_prim(mode); + uint alloc_size = max_verts * index_size_B; - uint first_group = - ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]); - uint off = ctz(first_group < 32 ? scratch[first_group] : 0); - return (first_group * 32) + off; + /* Allocate unrolled index buffer. + * + * TODO: For multidraw, should be atomic. But multidraw+unroll isn't + * currently wired up in any driver. + */ + uint old_heap_bottom_B = poly_heap_alloc_offs(heap, alloc_size); + + /* Setup most of the descriptor. Count will be determined after unroll. 
*/ + out_draw[1] = in_draw[1]; /* instance count */ + out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */ + out_draw[3] = in_draw[2]; /* index bias */ + out_draw[4] = in_draw[3]; /* base instance */ + + /* Return the index buffer we allocated */ + return (global uchar *)heap->base + old_heap_bottom_B; } -// KERNEL(1024) +/* TODO_KOSMICKRISP KERNEL(1024) */ KERNEL(1) libkk_unroll_geometry_and_restart( - constant uint8_t *index_buffer, global uint8_t *out_ptr, + uint64_t index_buffer, global struct poly_heap *heap, constant uint32_t *in_draw, global uint32_t *out_draw, uint32_t restart_index, uint32_t index_buffer_size_el, uint32_t in_el_size_B, uint32_t out_el_size_B, uint32_t flatshade_first, uint32_t mode) @@ -223,10 +68,21 @@ libkk_unroll_geometry_and_restart( uint tid = cl_local_id.x; uint count = in_draw[0]; - constant uint8_t *in_ptr = - index_buffer ? index_buffer + (in_draw[2] * in_el_size_B) : index_buffer; + uintptr_t out_ptr; + if (tid == 0) { + if (index_buffer) + out_ptr = (uintptr_t)poly_setup_unroll_for_draw( + heap, in_draw, out_draw, mode, out_el_size_B); + else + out_ptr = (uintptr_t)kk_setup_unroll_for_non_indexed_draw( + heap, in_draw, out_draw, mode, out_el_size_B); + } - // local uint scratch[32]; + uintptr_t in_ptr = index_buffer + ? 
(uintptr_t)index_buffer + (in_draw[2] * in_el_size_B) + : (uintptr_t)index_buffer; + + /* TODO_KOSMICKRISP local uint scratch[32]; */ uint out_prims = 0; uint needle = 0; @@ -240,11 +96,13 @@ libkk_unroll_geometry_and_restart( idx >= count || load_index(in_ptr, index_buffer_size_el, idx, in_el_size_B) == restart_index; - // uint next_offs = first_true_thread_in_workgroup(restart, scratch); + /* TODO_KOSMICKRISP Uncomment this when subgroups are reliable + uint next_offs = poly_work_group_first_true(restart, scratch); - // next_restart += next_offs; - // if (next_offs < 1024) - // break; + next_restart += next_offs; + if (next_offs < cl_local_size.x) + break; + */ if (restart) break; next_restart++; @@ -254,17 +112,17 @@ libkk_unroll_geometry_and_restart( uint subcount = next_restart - needle; uint subprims = u_decomposed_prims_for_vertices(mode, subcount); uint out_prims_base = out_prims; - for (uint i = tid; i < subprims; /*i += 1024*/ ++i) { + for (uint i = tid; i < subprims; /*i += cl_local_size.x*/ ++i) { for (uint vtx = 0; vtx < per_prim; ++vtx) { - uint id = - vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims); + uint id = poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, + subprims); uint offset = needle + id; uint x = ((out_prims_base + i) * per_prim) + vtx; uint y = load_index(in_ptr, index_buffer_size_el, offset, in_el_size_B); - store_index(out_ptr, out_el_size_B, x, y); + poly_store_index(out_ptr, out_el_size_B, x, y); } } @@ -273,10 +131,6 @@ libkk_unroll_geometry_and_restart( } if (tid == 0) { - out_draw[0] = out_prims * per_prim; /* indexCount */ - out_draw[1] = in_draw[1]; /* instanceCount */ - out_draw[2] = 0u; /* firstIndex */ - out_draw[3] = index_buffer ? in_draw[3] : in_draw[2]; /* vertexOffset */ - out_draw[4] = index_buffer ? 
in_draw[4] : in_draw[3]; /* firstInstance */ + out_draw[0] = out_prims * per_prim; } } diff --git a/src/kosmickrisp/vulkan/kk_cmd_buffer.c b/src/kosmickrisp/vulkan/kk_cmd_buffer.c index 3ff5f3042c9..3408e5e4eb2 100644 --- a/src/kosmickrisp/vulkan/kk_cmd_buffer.c +++ b/src/kosmickrisp/vulkan/kk_cmd_buffer.c @@ -105,6 +105,7 @@ kk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, kk_cmd_release_resources(dev, cmd); memset(&cmd->state, 0, sizeof(cmd->state)); + cmd->uses_heap = false; } const struct vk_command_buffer_ops kk_cmd_buffer_ops = { diff --git a/src/kosmickrisp/vulkan/kk_cmd_buffer.h b/src/kosmickrisp/vulkan/kk_cmd_buffer.h index 0e44ed0fb12..0b23f166767 100644 --- a/src/kosmickrisp/vulkan/kk_cmd_buffer.h +++ b/src/kosmickrisp/vulkan/kk_cmd_buffer.h @@ -138,9 +138,6 @@ struct kk_graphics_state { struct { struct kk_addr_range addr_range[KK_MAX_VBUFS]; mtl_buffer *handles[KK_MAX_VBUFS]; - /* Required to understand maximum size of index buffer if primitive is - * triangle fans */ - uint32_t max_vertices; } vb; /* Needed by vk_command_buffer::dynamic_graphics_state */ @@ -170,6 +167,9 @@ struct kk_cmd_buffer { /* Owned large BOs */ struct util_dynarray large_bos; + + /* Does the command buffer use the geometry heap? */ + bool uses_heap; }; VK_DEFINE_HANDLE_CASTS(kk_cmd_buffer, vk.base, VkCommandBuffer, @@ -213,6 +213,11 @@ kk_cmd_buffer_dirty_all_gfx(struct kk_cmd_buffer *cmd) cmd->state.dirty_shaders = ~0u; cmd->state.gfx.dirty = ~0u; cmd->state.gfx.descriptors.root_dirty = true; + + /* We just flushed out the heap use. If we want to use it again, we'll need + * to queue a free for it again. 
+ */ + cmd->uses_heap = false; } void kk_cmd_release_dynamic_ds_state(struct kk_cmd_buffer *cmd); diff --git a/src/kosmickrisp/vulkan/kk_cmd_draw.c b/src/kosmickrisp/vulkan/kk_cmd_draw.c index d06484fa4aa..c9990aea44e 100644 --- a/src/kosmickrisp/vulkan/kk_cmd_draw.c +++ b/src/kosmickrisp/vulkan/kk_cmd_draw.c @@ -19,6 +19,8 @@ #include "kosmickrisp/bridge/mtl_bridge.h" #include "kosmickrisp/bridge/vk_to_mtl_map.h" +#include "poly/geometry.h" + #include "vulkan/util/vk_format.h" static void @@ -766,7 +768,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd) IS_DIRTY(VI_BINDING_STRIDES) || gfx->dirty & KK_DIRTY_VB) { struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX]; unsigned slot = 0; - cmd->state.gfx.vb.max_vertices = 0u; u_foreach_bit(i, vs->info.vs.attribs_read) { if (dyn->vi->attributes_valid & BITFIELD_BIT(i)) { struct vk_vertex_attribute_state attr = dyn->vi->attributes[i]; @@ -778,10 +779,6 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd) &desc->root.draw.attrib_base[slot]); desc->root.draw.buffer_strides[attr.binding] = dyn->vi_binding_strides[attr.binding]; - - cmd->state.gfx.vb.max_vertices = - MAX2(vb.range / dyn->vi_binding_strides[attr.binding], - cmd->state.gfx.vb.max_vertices); } slot++; } @@ -843,12 +840,52 @@ struct kk_draw_data { bool restart; }; +static void +kk_init_heap(const void *data) +{ + struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data; + struct kk_device *dev = kk_cmd_buffer_device(cmd); + + size_t size = 128 * 1024 * 1024; + kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap); + + struct poly_heap *map = (struct poly_heap *)dev->heap->cpu; + + /* TODO_KOSMICKRISP Self-contained until we have rodata at the device. 
*/ + *map = (struct poly_heap){ + .base = dev->heap->gpu + sizeof(struct poly_heap), + .size = size - sizeof(struct poly_heap), + }; +} + +static uint64_t +kk_heap(struct kk_cmd_buffer *cmd) +{ + struct kk_device *dev = kk_cmd_buffer_device(cmd); + + util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd); + + /* We need to free all allocations after each command buffer execution */ + if (!cmd->uses_heap) { + uint64_t addr = dev->heap->gpu; + + /* Zeroing the allocated index frees everything */ + kk_cmd_write(cmd, (struct libkk_imm_write){ + addr + offsetof(struct poly_heap, bottom), 0}); + + cmd->uses_heap = true; + } + + return dev->heap->gpu; +} + /* Unrolling will always be done through indirect rendering, so if this is * called from non-indirect calls, we will fake it. */ static struct kk_draw_data kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data, bool promote_index_type) { + struct kk_device *dev = kk_cmd_buffer_device(cmd); if (!data.indirect) { if (data.indexed) { VkDrawIndexedIndirectCommand draw = { @@ -875,38 +912,24 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data, } } - uint32_t el_count = cmd->state.gfx.vb.max_vertices; - if (data.indexed) { - el_count = - (mtl_buffer_get_length(data.index_buffer) - data.index_buffer_offset) / - data.index_size; - } + struct kk_bo *out_draw = + kk_cmd_allocate_buffer(cmd, sizeof(VkDrawIndexedIndirectCommand), 4u); - uint32_t decomposed_index_count = - u_decomposed_prims_for_vertices(data.prim, el_count) * - mesa_vertices_per_prim(data.prim); - uint32_t el_size_B = 4u; - uint32_t index_buffer_size_B = decomposed_index_count * el_size_B; - uint32_t buffer_size_B = - sizeof(VkDrawIndexedIndirectCommand) + index_buffer_size_B; - struct kk_bo *index_buffer = - kk_cmd_allocate_buffer(cmd, buffer_size_B, el_size_B); - - if (!index_buffer) + if (!out_draw) return data; struct libkk_unroll_geometry_and_restart_args info = { .index_buffer = 
mtl_buffer_get_gpu_address(data.index_buffer) + data.index_buffer_offset, - .out_ptr = index_buffer->gpu + sizeof(VkDrawIndexedIndirectCommand), + .heap = kk_heap(cmd), .in_draw = mtl_buffer_get_gpu_address(data.indirect_buffer) + data.indirect_buffer_offset, - .out_draw = index_buffer->gpu, + .out_draw = out_draw->gpu, .restart_index = promote_index_type ? UINT32_MAX : cmd->state.gfx.index.restart, .index_buffer_size_el = data.index_buffer_range_B, .in_el_size_B = data.index_size, - .out_el_size_B = el_size_B, + .out_el_size_B = 4u, .flatshade_first = true, .mode = data.prim, }; @@ -914,14 +937,15 @@ kk_unroll_geometry(struct kk_cmd_buffer *cmd, struct kk_draw_data data, struct mtl_size grid = {1, 1, 1}; libkk_unroll_geometry_and_restart_struct(cmd, grid, true, info); - data.indirect_buffer = index_buffer->map; - data.index_buffer = index_buffer->map; - data.index_buffer_offset = sizeof(VkDrawIndexedIndirectCommand); + data.indirect_buffer = out_draw->map; + data.index_buffer = dev->heap->map; + /* TODO_KOSMICKRISP Self-contained until we have rodata at the device. 
*/ data.index_buffer_offset = sizeof(struct poly_heap); data.indirect_buffer_offset = 0u; - data.index_buffer_range_B = index_buffer_size_B; + data.index_buffer_range_B = dev->heap->size_B - sizeof(struct poly_heap); data.first_index = 0u; data.prim = u_decomposed_prim(data.prim); - data.index_size = el_size_B; + data.index_size = 4u; data.indirect = true; data.indexed = true; data.restart = false; diff --git a/src/kosmickrisp/vulkan/kk_device.c b/src/kosmickrisp/vulkan/kk_device.c index c81f92adbdb..9ebe8afb387 100644 --- a/src/kosmickrisp/vulkan/kk_device.c +++ b/src/kosmickrisp/vulkan/kk_device.c @@ -287,6 +287,10 @@ kk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) kk_query_table_finish(dev, &dev->occlusion_queries); kk_destroy_sampler_heap(dev, &dev->samplers); + /* Geometry heap */ + if (dev->heap) + kk_destroy_bo(dev, dev->heap); + /* Release the residency set last once all BOs are released. */ mtl_release(dev->residency_set.handle); simple_mtx_destroy(&dev->residency_set.mutex); diff --git a/src/kosmickrisp/vulkan/kk_device.h b/src/kosmickrisp/vulkan/kk_device.h index 1553e0956d8..cc15b049dd7 100644 --- a/src/kosmickrisp/vulkan/kk_device.h +++ b/src/kosmickrisp/vulkan/kk_device.h @@ -97,6 +97,10 @@ struct kk_device { struct vk_meta_device meta; + /* Geometry heap */ + struct kk_bo *heap; + util_once_flag heap_init_once; + uint64_t disabled_workarounds; bool gpu_capture_enabled; };