/*
 * Copyright 2023 Alyssa Rosenzweig
 * Copyright 2023 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/u_math.h"

#ifdef __OPENCL_VERSION__
#include "compiler/libcl/libcl_vk.h"
#endif

#pragma once

#define POLY_MAX_SO_BUFFERS     4
#define POLY_MAX_VERTEX_STREAMS 4

/* How the output of a geometry shader is fed to the rasterization draw. */
enum poly_gs_shape {
   /* Indexed, where indices are encoded as:
    *
    *    round_to_pot(max_indices) * round_to_pot(input_primitives) *
    *       instance_count
    *
    * invoked for max_indices * input_primitives * instance_count indices.
    *
    * This is used with any dynamic topology. No hardware instancing used.
    */
   POLY_GS_SHAPE_DYNAMIC_INDEXED,

   /* Indexed with a static index buffer. Indices range up to max_indices.
    * Hardware instance count = input_primitives * software instance count.
    */
   POLY_GS_SHAPE_STATIC_INDEXED,

   /* Non-indexed. Dispatched as:
    *
    *    (max_indices, input_primitives * instance count).
    */
   POLY_GS_SHAPE_STATIC_PER_PRIM,

   /* Non-indexed. Dispatched as:
    *
    *    (max_indices * input_primitives, instance count).
    */
   POLY_GS_SHAPE_STATIC_PER_INSTANCE,
};

/*
 * Vertex/index count of the rasterization draw for a given shape. This is the
 * value poly_geometry_params_set_draw() stores in draw.index_count.
 */
static inline unsigned
poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
                      unsigned input_primitives, unsigned instance_count)
{
   switch (shape) {
   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
      return max_indices * input_primitives * instance_count;

   case POLY_GS_SHAPE_STATIC_INDEXED:
   case POLY_GS_SHAPE_STATIC_PER_PRIM:
      return max_indices;

   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
      return max_indices * input_primitives;
   }

   UNREACHABLE("invalid shape");
}

/*
 * Instance count of the rasterization draw for a given shape. This is the
 * value poly_geometry_params_set_draw() stores in draw.instance_count.
 * max_indices is accepted for symmetry with poly_gs_rast_vertices() but is not
 * read.
 */
static inline unsigned
poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
                       unsigned input_primitives, unsigned instance_count)
{
   switch (shape) {
   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
      return 1;

   case POLY_GS_SHAPE_STATIC_INDEXED:
   case POLY_GS_SHAPE_STATIC_PER_PRIM:
      return input_primitives * instance_count;

   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
      return instance_count;
   }

   UNREACHABLE("invalid shape");
}

/* Whether the shape's rasterization draw consumes an index buffer. */
static inline bool
poly_gs_indexed(enum poly_gs_shape shape)
{
   return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
          shape == POLY_GS_SHAPE_STATIC_INDEXED;
}

/*
 * Index size associated with a shape: 4 for the dynamic index buffer (which
 * poly_gs_setup_indirect() allocates with 4 bytes per index), 1 for the static
 * index buffer, and 0 for the non-indexed shapes.
 */
static inline unsigned
poly_gs_index_size(enum poly_gs_shape shape)
{
   switch (shape) {
   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
      return 4;
   case POLY_GS_SHAPE_STATIC_INDEXED:
      return 1;
   default:
      return 0;
   }
}

/* Heap to allocate from. */
struct poly_heap {
   DEVICE(uchar) base;
   uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct poly_heap) == 4 * 4,
              "struct poly_heap must be 4 words");

#ifdef __OPENCL_VERSION__
/*
 * Bump-allocate size_B bytes (rounded up to a multiple of 16) from the heap and
 * return the offset of the allocation relative to heap->base. Aborts with a
 * diagnostic if the heap is exhausted.
 */
static inline uint
poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B)
{
   size_B = align(size_B, 16);

   uint offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);

   /* Use printf+abort because assert is stripped from release builds.
    *
    * NOTE(review): this re-reads heap->bottom instead of using offs + size_B,
    * so with concurrent allocators the value checked may already include
    * other threads' allocations. Confirm that is intended.
    */
   if (heap->bottom >= heap->size) {
      printf(
         "FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
         size_B, offs, heap->size);

      abort();
   }

   return offs;
}

/* Like poly_heap_alloc_offs() but returns a pointer into the heap. */
static inline global void *
poly_heap_alloc(global struct poly_heap *heap, uint size_B)
{
   return heap->base + poly_heap_alloc_offs(heap, size_B);
}

uint64_t nir_load_ro_sink_address_poly(void);

/*
 * Address of element offset_el of an index buffer holding size_el elements of
 * elsize_B bytes each. If the offset is out of bounds, the address returned by
 * nir_load_ro_sink_address_poly() is used instead.
 */
static inline uint64_t
poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
                  uint elsize_B)
{
   /* Widen before scaling so the byte offset cannot wrap at 32 bits. */
   if (offset_el < size_el)
      return index_buffer + ((uint64_t)offset_el * elsize_B);
   else
      return nir_load_ro_sink_address_poly();
}
#endif

/** Parameters that feed a vertex (or tessellation evaluation) shader.
 *
 * From the perspective of libpoly, vertex and tessellation evaluation shaders
 * are identical. One just gets fed by the hardware's input assembly (which
 * may be emulated by the driver) and the other gets fed from the tessellator.
 * However, from the perspective of a geometry dispatch, they are identical.
 */
struct poly_vertex_params {
   /* Index buffer if present. */
   uint64_t index_buffer;

   /* Size of an index in the index buffer, in bytes */
   uint32_t index_size_B;

   /* Size of the bound index buffer for bounds checking */
   uint32_t index_buffer_range_el;

   /* Number of vertices per instance. Written by CPU for direct draw, indirect
    * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
    */
   uint32_t verts_per_instance;

   /* Within an indirect VS draw, the grids used to dispatch the VS written
    * out by the GS indirect setup kernel or the CPU for a direct draw. This is
    * the "indirect local" format: first 3 is in threads, second 3 is in grid
    * blocks. This lets us use nontrivial workgroups with indirect draws without
    * needing any predication.
    */
   uint32_t grid[6];

   uint32_t _pad;

   /* Output buffer for vertex data */
   uint64_t output_buffer;

   /* Mask of outputs present in the output buffer */
   uint64_t outputs;
} PACKED;
static_assert(sizeof(struct poly_vertex_params) == 16 * 4,
              "struct poly_vertex_params must be 16 words");

/*
 * Zero-initialize the vertex parameters, recording the output mask and the
 * workgroup size. The draw-dependent grid dimensions are left at zero.
 */
static inline void
poly_vertex_params_init(struct poly_vertex_params *p, uint64_t outputs,
                        const uint32_t wg_size[3])
{
   *p = (struct poly_vertex_params) {
      .outputs = outputs,
      .grid = {
         0, 0, 1, /* x/y are set by poly_vertex_params_set_draw() */
         wg_size[0], wg_size[1], wg_size[2],
      },
   };
}

/* Record the per-draw vertex and instance counts in the vertex parameters. */
static inline void
poly_vertex_params_set_draw(struct poly_vertex_params *p,
                            uint32_t vertex_count, uint32_t instance_count)
{
   /* Invoke VS as (vertices, instances) */
   p->verts_per_instance = vertex_count;
   p->grid[0] = vertex_count;
   p->grid[1] = instance_count;
}

/*
 * Number of elements remaining in an index buffer of size_el elements after
 * skipping offset_el elements, clamped to zero when the offset is out of
 * bounds.
 */
static inline uint
poly_index_buffer_range_el(uint size_el, uint offset_el)
{
   return offset_el < size_el ? (size_el - offset_el) : 0;
}

/* This must match VkDraw[Indexed]IndirectCommand
 *
 * The vertex/index_count and first_vertex/index fields line up, as does
 * instance_count. The only ones that don't are vertexOffset and
 * firstInstance but we always set those to zero.
 */
struct poly_indirect_draw {
   union {
      uint32_t vertex_count;
      uint32_t index_count;
   };
   uint32_t instance_count;
   union {
      uint32_t first_vertex;
      uint32_t first_index;
   };
   uint32_t zeros[2];
};
static_assert(sizeof(struct poly_indirect_draw) == 5 * 4,
              "struct poly_indirect_draw must be 5 words");

/* Parameters shared between the driver and the geometry shader pipeline. */
struct poly_geometry_params {
   /* Address of count buffer. For an indirect draw, this will be written by the
    * indirect setup kernel.
    */
   DEVICE(uint) count_buffer;

   /* Address of the primitives generated counters */
   DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
   DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
   DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS];
   DEVICE(uint) xfb_any_overflow;

   /* Pointers to transform feedback buffer offsets in bytes */
   DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS];

   /* Output index buffer, allocated by pre-GS. */
   DEVICE(uint) output_index_buffer;

   /* Address of transform feedback buffer in general, supplied by the CPU. */
   DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS];

   /* Address of transform feedback for the current primitive. Written by pre-GS
    * program.
    */
   DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS];

   /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
   uint64_t flat_outputs;

   uint32_t xfb_size[POLY_MAX_SO_BUFFERS];

   /* Number of vertices emitted by transform feedback per stream. Written by
    * the pre-GS program.
    */
   uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];

   /* Within an indirect GS draw, the grids used to dispatch the GS written
    * out by the GS indirect setup kernel or the CPU for a direct draw. This is
    * the "indirect local" format: first 3 is in threads, second 3 is in grid
    * blocks. This lets us use nontrivial workgroups with indirect draws without
    * needing any predication.
    */
   uint32_t grid[6];

   /* Indirect draw command */
   struct poly_indirect_draw draw;

   /* Number of input primitives across all instances, calculated by the CPU for
    * a direct draw or the GS indirect setup kernel for an indirect draw.
    */
   uint32_t input_primitives;

   /* Number of input primitives per instance, rounded up to a power-of-two and
    * with the base-2 log taken. This is used to partition the output vertex IDs
    * efficiently.
    */
   uint32_t primitives_log2;

   /* Number of bytes output by the GS count shader per input primitive (may be
    * 0), written by CPU and consumed by indirect draw setup shader for
    * allocating counts.
    */
   uint32_t count_buffer_stride;

   /* Dynamic input topology. Must be compatible with the geometry shader's
    * layout() declared input class.
    */
   uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct poly_geometry_params) == 79 * 4,
              "struct poly_geometry_params must be 79 words");

/*
 * Zero-initialize the geometry parameters, recording the input topology and
 * the workgroup size. The draw-dependent grid dimensions are left at zero.
 */
static inline void
poly_geometry_params_init(struct poly_geometry_params *p, enum mesa_prim prim,
                          const uint32_t wg_size[3])
{
   *p = (struct poly_geometry_params) {
      .input_topology = prim,
      .grid = {
         0, 0, 1, /* x/y are set by poly_geometry_params_set_draw() */
         wg_size[0], wg_size[1], wg_size[2],
      },
   };
}

/*
 * Fill in everything in the geometry parameters that depends on the draw's
 * vertex and instance counts: the GS dispatch grid, the input primitive counts
 * and the index/instance counts of the rasterization draw.
 */
static inline void
poly_geometry_params_set_draw(struct poly_geometry_params *p,
                              enum mesa_prim prim, enum poly_gs_shape shape,
                              uint32_t max_indices, uint32_t vertex_count,
                              uint32_t instance_count)
{
   /* Calculate number of primitives input into the GS */
   const uint32_t prim_per_instance =
      u_decomposed_prims_for_vertices(prim, vertex_count);

   /* Invoke GS as (primitives, instances) */
   p->grid[0] = prim_per_instance;
   p->grid[1] = instance_count;

   p->input_primitives = prim_per_instance * instance_count;
   p->primitives_log2 = util_logbase2_ceil(prim_per_instance);

   p->draw.index_count = poly_gs_rast_vertices(
      shape, max_indices, prim_per_instance, instance_count);
   p->draw.instance_count = poly_gs_rast_instances(
      shape, max_indices, prim_per_instance, instance_count);
}

/* TCS shared memory layout:
 *
 *    vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
 *
 * TODO: compact.
 */
static inline uint
poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
                    uint64_t crosslane_vs_out_mask)
{
   /* Each vertex stores one entry per bit set in the mask... */
   uint base = vtx * util_bitcount64(crosslane_vs_out_mask);

   /* ...and a location's entry index is the number of set bits below it. */
   uint offs = util_bitcount64(crosslane_vs_out_mask &
                               (((uint64_t)(1) << location) - 1));

   return base + offs;
}

/*
 * Size of the TCS input for a patch: 16 per (vertex, output) pair, which
 * matches the vec4 entries of the layout above if the unit is bytes.
 */
static inline uint
poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
   return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}

/*
 * TCS out buffer layout, per-patch:
 *
 *    float tess_level_outer[4];
 *    float tess_level_inner[2];
 *    vec4 patch_out[MAX_PATCH_OUTPUTS];
 *    vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
 *
 * Vertex out are compacted based on the mask of written out. Patch
 * out are used as-is.
 *
 * Bounding boxes are ignored.
 */
static inline uint
poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
                     uint64_t vtx_out_mask)
{
   uint off = 0;
   if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
      return off;

   off += 4;
   if (location == VARYING_SLOT_TESS_LEVEL_INNER)
      return off;

   off += 2;
   if (location >= VARYING_SLOT_PATCH0)
      return off + (4 * (location - VARYING_SLOT_PATCH0));

   /* Anything else is a per-vtx output */
   off += 4 * nr_patch_out;
   off += 4 * vtx_id * util_bitcount64(vtx_out_mask);

   uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
   return off + (4 * idx);
}

/*
 * Per-patch stride of the TCS out buffer in elements, computed as the offset
 * of VARYING_SLOT_POS for a hypothetical vertex one past the end of the patch
 * (presumably VARYING_SLOT_POS is the lowest per-vertex slot -- TODO confirm).
 */
static inline uint
poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
                       uint64_t vtx_out_mask)
{
   return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
                               vtx_out_mask);
}

/* poly_tcs_out_stride_el() scaled by 4 (presumably bytes per element). */
static inline uint
poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
                    uint64_t vtx_out_mask)
{
   return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
          4;
}

/* In a tess eval shader, stride for hw vertex ID */
#define POLY_TES_PATCH_ID_STRIDE 8192

/*
 * Pack a primitive type into a dense encoding by skipping the three
 * consecutive values QUADS, QUAD_STRIP and POLYGON, which (along with PATCHES)
 * must not be passed in.
 */
static inline uint
poly_compact_prim(enum mesa_prim prim)
{
   static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1,
                 "MESA_PRIM_QUAD_STRIP must be immediately after MESA_PRIM_QUADS");
   static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2,
                 "MESA_PRIM_POLYGON must be immediately after MESA_PRIM_QUAD_STRIP");

#ifndef __OPENCL_VERSION__
   assert(prim != MESA_PRIM_QUADS);
   assert(prim != MESA_PRIM_QUAD_STRIP);
   assert(prim != MESA_PRIM_POLYGON);
   assert(prim != MESA_PRIM_PATCHES);
#endif

   return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}

/* Inverse of poly_compact_prim(). */
static inline enum mesa_prim
poly_uncompact_prim(uint packed)
{
   if (packed >= MESA_PRIM_QUADS)
      return (enum mesa_prim)(packed + 3);

   return (enum mesa_prim)packed;
}

/*
 * Write a strip into a 32-bit index buffer. This is the sequence:
 *
 *    (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
 *
 * For points, we write index buffers without restart just for remapping.
 *
 * Here n is the minimum number of vertices per primitive: strips with fewer
 * than n vertices are dropped entirely, and the restart index is only written
 * when n > 1.
 */
static inline void
_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
                  uint32_t vertex_offset, uint32_t verts_in_prim,
                  uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
   bool restart = n > 1;
   if (verts_in_prim < n)
      return;

   GLOBAL uint32_t *out = &index_buffer[index_offset];

   /* Write out indices for the strip */
   for (uint32_t i = 0; i < verts_in_prim; ++i) {
      out[i] = (vertex_offset + i) * stream_multiplier + stream;
   }

   if (restart)
      out[verts_in_prim] = -1;
}

/*
 * Number of decomposed primitives for a vertex count, handling patches (which
 * consume verts_per_patch vertices each) in addition to whatever
 * u_decomposed_prims_for_vertices() supports.
 */
static inline unsigned
poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
                                             unsigned verts_per_patch)
{
   if (prim >= MESA_PRIM_PATCHES) {
      return vertices / verts_per_patch;
   } else {
      return u_decomposed_prims_for_vertices(prim, vertices);
   }
}

#ifdef __OPENCL_VERSION__
/*
 * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
 * manually with subgroup ops and local memory since Mesa doesn't do those
 * lowerings yet.
 *
 * NOTE(review): the broadcasts from lane 31 look like they assume 32-wide
 * subgroups, and the final reduction sums scratch[0..31], so presumably
 * scratch holds 32 entries and every one is written (i.e. the workgroup has 32
 * subgroups) -- confirm against callers.
 */
static inline uint2
poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
   uint sg_id = get_sub_group_id();

   /* Partial prefix sum of the subgroup */
   uint sg = sub_group_scan_inclusive_add(x);

   /* Reduction (sum) for the subgroup */
   uint sg_sum = sub_group_broadcast(sg, 31);

   /* Write out all the subgroups sums */
   barrier(CLK_LOCAL_MEM_FENCE);
   scratch[sg_id] = sg_sum;
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
    * threads in subgroup T.
    */
   uint other_sum = scratch[get_sub_group_local_id()];

   /* Exclusive sum the subgroup sums to get the total before the current group,
    * which can be added to the total for the current group.
    */
   uint other_sums = sub_group_scan_exclusive_add(other_sum);
   uint base = sub_group_broadcast(other_sums, sg_id);
   uint prefix = base + sg;

   /* Reduce the workgroup using the prefix sum we already did */
   uint reduction = sub_group_broadcast(other_sums + other_sum, 31);

   return (uint2)(prefix, reduction);
}

/*
 * In-place inclusive prefix sum over len records of a buffer, where each
 * record is `words` uints wide and only the uint at index `word` of each record
 * takes part. wg_count is the number of threads cooperating on the sum.
 */
static inline void
poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
                uint word, uint wg_count)
{
   uint tid = cl_local_id.x;

   /* Main loop: complete workgroups processing multiple values at once */
   uint i, count = 0;
   uint len_remainder = len % wg_count;
   uint len_rounded_down = len - len_remainder;

   for (i = tid; i < len_rounded_down; i += wg_count) {
      global uint *ptr = &buffer[(i * words) + word];
      uint value = *ptr;
      uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);

      *ptr = count + sums[0];
      count += sums[1];
   }

   /* The last iteration is special since we won't have a full subgroup unless
    * the length is divisible by the subgroup size, and we don't advance count.
    */
   global uint *ptr = &buffer[(i * words) + word];
   uint value = (tid < len_remainder) ? *ptr : 0;
   uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];

   if (tid < len_remainder) {
      *ptr = count + scan;
   }
}

/* Add count to each of up to three counters, skipping any that are NULL. */
static inline void
poly_increment_counters(global uint32_t *a, global uint32_t *b,
                        global uint32_t *c, uint count)
{
   global uint32_t *ptr[] = {a, b, c};

   for (uint i = 0; i < 3; ++i) {
      if (ptr[i]) {
         *(ptr[i]) += count;
      }
   }
}

/*
 * Update the counters for a draw whose first two words are read as
 * (vertex or index count, instance count). Vertex-rate counters advance by
 * vertices * instances, primitive-rate counters by primitives * instances.
 * Any counter pointer may be NULL.
 */
static inline void
poly_increment_ia(global uint32_t *ia_vertices,
                  global uint32_t *ia_primitives,
                  global uint32_t *vs_invocations, global uint32_t *c_prims,
                  global uint32_t *c_invs, constant uint32_t *draw,
                  enum mesa_prim prim, unsigned verts_per_patch)
{
   poly_increment_counters(ia_vertices, vs_invocations, NULL,
                           draw[0] * draw[1]);

   uint prims = poly_decomposed_prims_for_vertices_with_tess(
                   prim, draw[0], verts_per_patch) *
                draw[1];

   poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
}

/*
 * Set up the vertex and geometry parameters for an indirect draw from the
 * draw descriptor: fill in the dispatch grids and counts, apply the index
 * buffer offset for indexed draws, and allocate the buffers the draw needs
 * from the heap.
 */
static inline void
poly_gs_setup_indirect(
   uint64_t index_buffer, constant uint *draw,
   global struct poly_vertex_params *vp /* output */,
   global struct poly_geometry_params *p /* output */,
   global struct poly_heap *heap,
   uint64_t vs_outputs /* Vertex (TES) output mask */,
   uint32_t index_size_B /* 0 if no index buffer */,
   uint32_t index_buffer_range_el,
   uint32_t prim /* Input primitive type, enum mesa_prim */,
   int is_prefix_summing, uint max_indices, enum poly_gs_shape shape)
{
   /* Determine the (primitives, instances) grid size. */
   uint vertex_count = draw[0];
   uint instance_count = draw[1];

   poly_vertex_params_set_draw(vp, vertex_count, instance_count);
   poly_geometry_params_set_draw(p, prim, shape, max_indices, vertex_count,
                                 instance_count);

   /* If indexing is enabled, the third word is the offset into the index buffer
    * in elements. Apply that offset now that we have it. For a hardware
    * indirect draw, the hardware would do this for us, but for software input
    * assembly we need to do it ourselves.
    */
   if (index_size_B) {
      vp->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
                                           draw[2], index_size_B);

      vp->index_buffer_range_el =
         poly_index_buffer_range_el(index_buffer_range_el, draw[2]);
   }

   /* We need to allocate VS and GS count buffers, do so now */
   uint vertex_buffer_size =
      poly_tcs_in_size(vertex_count * instance_count, vs_outputs);

   if (is_prefix_summing) {
      p->count_buffer = poly_heap_alloc(
         heap, p->input_primitives * p->count_buffer_stride);
   }

   vp->output_buffer = (uintptr_t)poly_heap_alloc(heap, vertex_buffer_size);
   vp->outputs = vs_outputs;

   /* Only the dynamic shape needs a per-draw index buffer. It lives in the
    * heap, so the draw's first index is the allocation's offset in indices.
    */
   if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
      const uint32_t index_offset =
         poly_heap_alloc_offs(heap, p->draw.index_count * 4);

      p->draw.first_index = index_offset / 4;
      p->output_index_buffer = (global uint *)(heap->base + index_offset);
   }
}

/*
 * Load index `id` from an index buffer with index_size-byte indices (1, 2 or
 * anything else for 4). Out-of-bounds loads return 0.
 */
static uint
poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
                uint index_size)
{
   bool oob = id >= index_buffer_range_el;

   /* If the load would be out-of-bounds, load the first element which is
    * assumed valid. If the application index buffer is empty with robustness2,
    * index_buffer will point to a zero sink where only the first is valid.
    */
   if (oob) {
      id = 0;
   }

   uint el;
   if (index_size == 1) {
      el = ((constant uint8_t *)index_buffer)[id];
   } else if (index_size == 2) {
      el = ((constant uint16_t *)index_buffer)[id];
   } else {
      el = ((constant uint32_t *)index_buffer)[id];
   }

   /* D3D robustness semantics. TODO: Optimize? */
   if (oob) {
      el = 0;
   }

   return el;
}

/*
 * Store `value` as index `id` of an index buffer with index_size_B-byte
 * indices (4, 2 or anything else for 1). No bounds checking is performed.
 */
static void
poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
   global uint32_t *out_32 = (global uint32_t *)index_buffer;
   global uint16_t *out_16 = (global uint16_t *)index_buffer;
   global uint8_t *out_8 = (global uint8_t *)index_buffer;

   if (index_size_B == 4)
      out_32[id] = value;
   else if (index_size_B == 2)
      out_16[id] = value;
   else
      out_8[id] = value;
}
#endif