diff --git a/meson.build b/meson.build index c7ed21a0961..ec978d02cdd 100644 --- a/meson.build +++ b/meson.build @@ -845,6 +845,10 @@ endif with_llvm = with_llvm \ .enable_if(with_clc, error_message : 'CLC requires LLVM') +with_poly = [ + with_gallium_asahi, with_asahi_vk, with_tools.contains('asahi'), +].contains(true) + dep_clc = null_dep if with_clc dep_clc = dependency('libclc') diff --git a/src/.clang-format b/src/.clang-format index beb26c8d3b2..099120f2cca 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -237,7 +237,9 @@ ForEachMacros: - agx_foreach_reg_dest - agx_foreach_successor - foreach_next_use - - libagx_foreach_xfb + +# poly + - poly_foreach_xfb # radv - PHASE diff --git a/src/asahi/lib/agx_helpers.h b/src/asahi/lib/agx_helpers.h index f9aa1220aeb..bf7dfb9a6da 100644 --- a/src/asahi/lib/agx_helpers.h +++ b/src/asahi/lib/agx_helpers.h @@ -316,16 +316,6 @@ agx_fill_decompress_args(struct ail_layout *layout, unsigned layer, agx_fill_decompress_args(layout, layer, level, ptr, images), \ util_logbase2(layout->sample_count_sa)) -#define libagx_tessellate(context, grid, barrier, prim, mode, state) \ - if (prim == TESS_PRIMITIVE_QUADS) { \ - libagx_tess_quad(context, grid, barrier, state, mode); \ - } else if (prim == TESS_PRIMITIVE_TRIANGLES) { \ - libagx_tess_tri(context, grid, barrier, state, mode); \ - } else { \ - assert(prim == TESS_PRIMITIVE_ISOLINES); \ - libagx_tess_isoline(context, grid, barrier, state, mode); \ - } - struct agx_border_packed; void agx_pack_border(struct agx_border_packed *out, const uint32_t in[4], diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h deleted file mode 100644 index e29705a9491..00000000000 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2023 Alyssa Rosenzweig - * SPDX-License-Identifier: MIT - */ - -#pragma once - -#include -#include -#include "libagx/geometry.h" -#include "nir.h" -#include "shader_enums.h" - -struct nir_def *agx_load_per_vertex_input(struct nir_builder *b, - nir_intrinsic_instr *intr, - struct nir_def *vertex); - -nir_def *agx_nir_load_vertex_id(struct nir_builder *b, nir_def *id, - unsigned index_size_B); - -bool agx_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B); - -bool agx_nir_lower_vs_before_gs(struct nir_shader *vs); - -struct agx_gs_info { - /* Output primitive mode for geometry shaders */ - enum mesa_prim mode; - - /* Number of words per primitive in the count buffer */ - unsigned count_words; - - /* Per-input primitive stride of the output index buffer */ - unsigned max_indices; - - /* Whether the GS includes transform feedback at a compile-time level */ - bool xfb; - - /* Whether a prefix sum is required on the count outputs. 
Implies xfb */ - bool prefix_sum; - - /* Whether the GS writes to a stream other than stream #0 */ - bool multistream; - - /* Shape of the rasterization draw, named by the instance ID */ - enum agx_gs_shape shape; - - /* Static topology used if shape = AGX_GS_SHAPE_STATIC_INDEXED */ - uint8_t topology[64]; -}; - -bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count, - struct nir_shader **gs_copy, struct nir_shader **pre_gs, - struct agx_gs_info *info); - -bool agx_nir_lower_tcs(struct nir_shader *tcs); - -bool agx_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs); - -uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir); - -unsigned agx_tcs_output_stride(const struct nir_shader *nir); diff --git a/src/asahi/lib/agx_nir_prolog_epilog.c b/src/asahi/lib/agx_nir_prolog_epilog.c index fe2d63e505d..254555f5f4c 100644 --- a/src/asahi/lib/agx_nir_prolog_epilog.c +++ b/src/asahi/lib/agx_nir_prolog_epilog.c @@ -5,11 +5,12 @@ */ #include "gallium/include/pipe/p_defines.h" +#include "poly/cl/libpoly.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/format/u_formats.h" #include "agx_abi.h" #include "agx_linker.h" #include "agx_nir.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_pack.h" #include "agx_tilebuffer.h" @@ -149,11 +150,11 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data) nir_def *id = nir_load_vertex_id(b); if (key->adjacency == MESA_PRIM_LINES_ADJACENCY) { - id = libagx_map_to_line_adj(b, id); + id = poly_map_to_line_adj(b, id); } else if (key->adjacency == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { - id = libagx_map_to_tri_strip_adj(b, id); + id = poly_map_to_tri_strip_adj(b, id); } else if (key->adjacency == MESA_PRIM_LINE_STRIP_ADJACENCY) { - id = libagx_map_to_line_strip_adj(b, id); + id = poly_map_to_line_strip_adj(b, id); } else if (key->adjacency == MESA_PRIM_TRIANGLES_ADJACENCY) { /* Sequence (0, 2, 4), (6, 8, 10), ... 
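       * Each adjacency triangle spans six raw vertices, of which only the
       * even ones form the rendered triangle, so doubling the flat vertex ID
       * (3t + k -> 6t + 2k) yields exactly the corner sequence above.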
*/ id = nir_imul_imm(b, id, 2); @@ -161,7 +162,7 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data) UNREACHABLE("unknown"); } - id = agx_nir_load_vertex_id(b, id, key->sw_index_size_B); + id = poly_nir_load_vertex_id(b, id, key->sw_index_size_B); nir_def_replace(&intr->def, id); return true; @@ -215,7 +216,7 @@ agx_nir_vs_prolog(nir_builder *b, const void *key_) } if (!key->hw) { - agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B); + poly_nir_lower_sw_vs(b->shader, key->sw_index_size_B); } else if (key->adjacency) { nir_shader_intrinsics_pass(b->shader, lower_adjacency, nir_metadata_control_flow, (void *)key); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index c9e46f41418..e50a70eff51 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -11,11 +11,8 @@ libasahi_lib_files = files( 'agx_linker.c', 'agx_bg_eot.c', 'agx_tilebuffer.c', - 'agx_nir_lower_gs.c', - 'agx_nir_lower_ia.c', 'agx_nir_lower_msaa.c', 'agx_nir_lower_sample_intrinsics.c', - 'agx_nir_lower_tess.c', 'agx_nir_lower_tilebuffer.c', 'agx_nir_lower_uvs.c', 'agx_nir_lower_vbo.c', @@ -66,8 +63,8 @@ libasahi_lib = static_library( include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm], c_args : [no_override_init_args, '-Wno-c2x-extensions'], gnu_symbol_visibility : 'hidden', - link_with: [libasahi_decode, libvdrm], - dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx], + link_with: [libasahi_decode, libvdrm, libpoly_nir], + dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx, idep_libpoly], build_by_default : false, ) diff --git a/src/asahi/libagx/draws.cl b/src/asahi/libagx/draws.cl index 88e88fadb2b..9c39d856691 100644 --- a/src/asahi/libagx/draws.cl +++ b/src/asahi/libagx/draws.cl @@ -4,8 +4,8 @@ */ #include "asahi/lib/agx_abi.h" #include "compiler/libcl/libcl_vk.h" +#include "poly/geometry.h" #include "agx_pack.h" -#include "geometry.h" #include "libagx_dgc.h" /* @@ -36,7 +36,7 @@ libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in, KERNEL(1) libagx_draw_without_adj(global VkDrawIndirectCommand *out, global VkDrawIndirectCommand *in, - global struct agx_ia_state *ia, uint64_t index_buffer, + global struct poly_ia_state *ia, uint64_t index_buffer, uint64_t index_buffer_range_el, int index_size_B, enum mesa_prim prim) { @@ -49,11 +49,11 @@ libagx_draw_without_adj(global VkDrawIndirectCommand *out, if (index_size_B) { uint offs = in->firstVertex; - ia->index_buffer = libagx_index_buffer( - index_buffer, index_buffer_range_el, offs, index_size_B); + ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el, + offs, index_size_B); ia->index_buffer_range_el = - libagx_index_buffer_range_el(index_buffer_range_el, offs); + poly_index_buffer_range_el(index_buffer_range_el, offs); } } @@ -122,8 +122,7 @@ libagx_memset_small(global uchar *dst, uchar b, int len, uint tid) * TODO: Handle multiple draws in parallel. 
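 *
 * In outline: one lane allocates a shadow index buffer from the heap and
 * broadcasts the pointer across the subgroup, the in-bounds indices are
 * copied into it, and the draw is rewritten to read the shadow copy from
 * offset 0.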
*/ KERNEL(32) -libagx_draw_robust_index(global uint32_t *vdm, - global struct agx_heap *heap, +libagx_draw_robust_index(global uint32_t *vdm, global struct poly_heap *heap, constant VkDrawIndexedIndirectCommand *cmd, uint64_t in_buf_ptr, uint32_t in_buf_range_B, ushort restart, enum agx_primitive topology, @@ -163,7 +162,7 @@ libagx_draw_robust_index(global uint32_t *vdm, /* Allocate memory for the shadow index buffer */ global uchar *padded; if (first) { - padded = agx_heap_alloc_nonatomic(heap, out_size_B); + padded = poly_heap_alloc_nonatomic(heap, out_size_B); } padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0); @@ -172,7 +171,7 @@ libagx_draw_robust_index(global uint32_t *vdm, draw.start = 0; /* Clone the index buffer. The destination is aligned as a post-condition - * of agx_heap_alloc_nonatomic. + * of poly_heap_alloc_nonatomic. */ libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid, 32); diff --git a/src/asahi/libagx/geometry.cl b/src/asahi/libagx/geometry.cl index bc72b487f5c..037b9dc061e 100644 --- a/src/asahi/libagx/geometry.cl +++ b/src/asahi/libagx/geometry.cl @@ -4,15 +4,11 @@ * SPDX-License-Identifier: MIT */ -#include "asahi/lib/agx_abi.h" #include "compiler/libcl/libcl_vk.h" +#include "poly/geometry.h" +#include "poly/tessellator.h" #include "util/macros.h" #include "util/u_math.h" -#include "geometry.h" -#include "query.h" -#include "tessellator.h" - -uint64_t nir_ro_to_rw_poly(uint64_t address); /* Swap the two non-provoking vertices in odd triangles. This generates a vertex * ID list with a consistent winding order. @@ -32,54 +28,6 @@ map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) return (provoking || even) ? vert : ((3 - pv) - vert); } -static inline uint -xfb_prim(uint id, uint n, uint copy) -{ - return sub_sat(id, n - 1u) + copy; -} - -/* - * Determine whether an output vertex has an n'th copy in the transform feedback - * buffer. This is written weirdly to let constant folding remove unnecessary - * stores when length is known statically. 
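- *
- * For example, in a triangle strip (n = 3) of length 5 there are three
- * primitives: vertex 2 is shared by primitives 0, 1 and 2, so copies 0, 1
- * and 2 all exist, while vertex 0 exists only for copy 0.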
- */ -bool -libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy) -{ - uint prim = xfb_prim(id, n, copy); - - int num_prims = length - (n - 1); - return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims); -} - -uint -libagx_xfb_vertex_offset(uint n, uint invocation_base_prim, - uint strip_base_prim, uint id_in_strip, uint copy, - bool flatshade_first) -{ - uint prim = xfb_prim(id_in_strip, n, copy); - uint vert_0 = min(id_in_strip, n - 1); - uint vert = vert_0 - copy; - - if (n == 3) { - vert = map_vertex_in_tri_strip(prim, vert, flatshade_first); - } - - /* Tally up in the whole buffer */ - uint base_prim = invocation_base_prim + strip_base_prim; - uint base_vertex = base_prim * n; - return base_vertex + (prim * n) + vert; -} - -uint64_t -libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index, - uint buffer, uint stride, uint output_offset) -{ - uint xfb_offset = (index * stride) + output_offset; - - return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset; -} - static uint vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) { @@ -90,20 +38,6 @@ vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) return prim + vert; } -uint -libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert, - uint num_prims) -{ - /* Line list, line strip, or line loop */ - if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1) - return 0; - - if (mode == MESA_PRIM_LINES) - prim *= 2; - - return prim + vert; -} - static uint vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) { @@ -122,44 +56,6 @@ vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) return (vert == 0) ? 0 : prim + vert; } -uint -libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert, - bool flatshade_first) -{ - if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) { - vert = vert + 1; - vert = (vert == 3) ? 0 : vert; - } - - if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0) - return 0; - - if (mode == MESA_PRIM_TRIANGLES) - prim *= 3; - - /* Triangle list, triangle strip, or triangle fan */ - if (mode == MESA_PRIM_TRIANGLE_STRIP) { - unsigned pv = flatshade_first ? 0 : 2; - - bool even = (prim & 1) == 0; - bool provoking = vert == pv; - - vert = ((provoking || even) ? vert : ((3 - pv) - vert)); - } - - return prim + vert; -} - -uint -libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert) -{ - /* Line list adj or line strip adj */ - if (mode == MESA_PRIM_LINES_ADJACENCY) - prim *= 4; - - return prim + vert; -} - static uint vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, bool flatshade_first) @@ -206,18 +102,6 @@ vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, return (prim * 2) + offset; } -uint -libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert, - uint nr, bool flatshade_first) -{ - /* Tri adj list or tri adj strip */ - if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { - return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first); - } else { - return (6 * prim) + vert; - } -} - static uint vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, uint vert, uint num_prims) @@ -262,127 +146,6 @@ vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, } } -uint -libagx_map_to_line_adj(uint id) -{ - /* Sequence (1, 2), (5, 6), (9, 10), ... 
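-    * i.e. flat IDs 0, 1, 2, 3 map to 1, 2, 5, 6: pair p = id/2 starts at
-    * raw vertex 4p + 1 and the low bit selects the second vertex.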
*/ - return ((id & ~1) * 2) + (id & 1) + 1; -} - -uint -libagx_map_to_line_strip_adj(uint id) -{ - /* Sequence (1, 2), (2, 3), (4, 5), .. */ - uint prim = id / 2; - uint vert = id & 1; - return prim + vert + 1; -} - -uint -libagx_map_to_tri_strip_adj(uint id) -{ - /* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8) - * - * Although tri strips with adjacency have 6 cases in general, after - * disregarding the vertices only available in a geometry shader, there are - * only even/odd cases. In other words, it's just a triangle strip subject to - * extra padding. - * - * Dividing through by two, the sequence is: - * - * (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4) - */ - uint prim = id / 3; - uint vtx = id % 3; - - /* Flip the winding order of odd triangles */ - if ((prim % 2) == 1) { - if (vtx == 1) - vtx = 2; - else if (vtx == 2) - vtx = 1; - } - - return 2 * (prim + vtx); -} - -static void -store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value) -{ - global uint32_t *out_32 = (global uint32_t *)index_buffer; - global uint16_t *out_16 = (global uint16_t *)index_buffer; - global uint8_t *out_8 = (global uint8_t *)index_buffer; - - if (index_size_B == 4) - out_32[id] = value; - else if (index_size_B == 2) - out_16[id] = value; - else - out_8[id] = value; -} - -static uint -load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, - uint index_size) -{ - bool oob = id >= index_buffer_range_el; - - /* If the load would be out-of-bounds, load the first element which is - * assumed valid. If the application index buffer is empty with robustness2, - * index_buffer will point to a zero sink where only the first is valid. - */ - if (oob) { - id = 0; - } - - uint el; - if (index_size == 1) { - el = ((constant uint8_t *)index_buffer)[id]; - } else if (index_size == 2) { - el = ((constant uint16_t *)index_buffer)[id]; - } else { - el = ((constant uint32_t *)index_buffer)[id]; - } - - /* D3D robustness semantics. TODO: Optimize? 
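-    * (The clamped load above keeps the access in bounds; the fixup below
-    * then discards its result, so an out-of-bounds index always reads 0.)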
*/ - if (oob) { - el = 0; - } - - return el; -} - -uint -libagx_load_index_buffer(constant struct agx_ia_state *p, uint id, - uint index_size) -{ - return load_index(p->index_buffer, p->index_buffer_range_el, id, index_size); -} - -static void -increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c, - uint count) -{ - global uint32_t *ptr[] = {a, b, c}; - - for (uint i = 0; i < 3; ++i) { - if (ptr[i]) { - *(ptr[i]) += count; - } - } -} - -static unsigned -decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices, - unsigned verts_per_patch) -{ - if (prim >= MESA_PRIM_PATCHES) { - return vertices / verts_per_patch; - } else { - return u_decomposed_prims_for_vertices(prim, vertices); - } -} - KERNEL(1) libagx_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives, @@ -390,13 +153,8 @@ libagx_increment_ia(global uint32_t *ia_vertices, global uint32_t *c_invs, constant uint32_t *draw, enum mesa_prim prim, unsigned verts_per_patch) { - increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]); - - uint prims = - decomposed_prims_for_vertices_with_tess(prim, draw[0], verts_per_patch) * - draw[1]; - - increment_counters(ia_primitives, c_prims, c_invs, prims); + poly_increment_ia(ia_vertices, ia_primitives, vs_invocations, c_prims, + c_invs, draw, prim, verts_per_patch); } KERNEL(1024) @@ -418,8 +176,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, /* Count non-restart indices */ for (uint i = tid; i < count; i += 1024) { - uint index = load_index(index_buffer, index_buffer_range_el, start + i, - index_size_B); + uint index = poly_load_index(index_buffer, index_buffer_range_el, + start + i, index_size_B); if (index != restart_index) partial++; @@ -433,7 +191,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, /* Elect a single thread from the workgroup to increment the counters */ if (tid == 0) { - increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]); + poly_increment_counters(ia_vertices, vs_invocations, NULL, + scratch * draw[1]); } /* TODO: We should vectorize this */ @@ -441,22 +200,22 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, uint accum = 0; int last_restart = -1; for (uint i = 0; i < count; ++i) { - uint index = load_index(index_buffer, index_buffer_range_el, start + i, - index_size_B); + uint index = poly_load_index(index_buffer, index_buffer_range_el, + start + i, index_size_B); if (index == restart_index) { - accum += decomposed_prims_for_vertices_with_tess( + accum += poly_decomposed_prims_for_vertices_with_tess( prim, i - last_restart - 1, verts_per_patch); last_restart = i; } } { - accum += decomposed_prims_for_vertices_with_tess( + accum += poly_decomposed_prims_for_vertices_with_tess( prim, count - last_restart - 1, verts_per_patch); } - increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]); + poly_increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]); } } @@ -483,7 +242,7 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch) * sets up most of the new draw descriptor. */ static global void * -setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, +setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw, global uint *out, enum mesa_prim mode, uint index_size_B) { /* Determine an upper bound on the memory required for the index buffer. @@ -499,7 +258,7 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, * TODO: For multidraw, should be atomic. 
But multidraw+unroll isn't * currently wired up in any driver. */ - uint old_heap_bottom_B = agx_heap_alloc_nonatomic_offs(heap, alloc_size); + uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size); /* Setup most of the descriptor. Count will be determined after unroll. */ out[1] = in_draw[1]; /* instance count */ @@ -512,14 +271,14 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, } KERNEL(1024) -libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, +libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer, constant uint *in_draw, global uint32_t *out_draw, uint32_t max_draws, uint32_t restart_index, uint32_t index_buffer_size_el, uint32_t index_size_log2, uint32_t flatshade_first, uint mode__11) { uint32_t index_size_B = 1 << index_size_log2; - enum mesa_prim mode = libagx_uncompact_prim(mode__11); + enum mesa_prim mode = poly_uncompact_prim(mode__11); uint tid = cl_local_id.x; uint count = in_draw[0]; @@ -531,7 +290,7 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, barrier(CLK_LOCAL_MEM_FENCE); - uintptr_t in_ptr = (uintptr_t)(libagx_index_buffer( + uintptr_t in_ptr = (uintptr_t)(poly_index_buffer( index_buffer, index_buffer_size_el, in_draw[2], index_size_B)); local uint scratch[32]; @@ -545,8 +304,8 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, for (;;) { uint idx = next_restart + tid; bool restart = - idx >= count || load_index(in_ptr, index_buffer_size_el, idx, - index_size_B) == restart_index; + idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx, + index_size_B) == restart_index; uint next_offs = first_true_thread_in_workgroup(restart, scratch); @@ -566,10 +325,10 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, uint offset = needle + id; uint x = ((out_prims_base + i) * per_prim) + vtx; - uint y = - load_index(in_ptr, index_buffer_size_el, offset, index_size_B); + uint y = poly_load_index(in_ptr, index_buffer_size_el, offset, + index_size_B); - store_index(out_ptr, index_size_B, x, y); + poly_store_index(out_ptr, index_size_B, x, y); } } @@ -581,216 +340,39 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, out_draw[0] = out_prims * per_prim; } -static uint -setup_xfb_buffer(global struct agx_geometry_params *p, uint i, uint stride, - uint max_output_end, uint vertices_per_prim) -{ - uint xfb_offset = *(p->xfb_offs_ptrs[i]); - p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset; - - /* Let output_end = output_offset + output_size. 
- * - * Primitive P will write up to (but not including) offset: - * - * xfb_offset + ((P - 1) * (verts_per_prim * stride)) - * + ((verts_per_prim - 1) * stride) - * + output_end - * - * To fit all outputs for P, that value must be less than the XFB - * buffer size for the output with maximal output_end, as everything - * else is constant here across outputs within a buffer/primitive: - * - * floor(P) <= (stride + size - xfb_offset - output_end) - * // (stride * verts_per_prim) - */ - int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset; - uint numer = max(numer_s, 0); - return numer / (stride * vertices_per_prim); -} - -void -libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset, - uint32_t prim_index_offset, uint32_t vertex_offset, - uint32_t verts_in_prim, uint3 info) -{ - _libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset, - vertex_offset, verts_in_prim, info.x, info.y, info.z); -} - -void -libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset, - uint nr_indices, uint alloc) -{ - for (uint i = nr_indices; i < alloc; ++i) { - index_buffer[inv_index_offset + i] = -1; - } -} - KERNEL(1) libagx_gs_setup_indirect( uint64_t index_buffer, constant uint *draw, global uintptr_t *vertex_buffer /* output */, - global struct agx_ia_state *ia /* output */, - global struct agx_geometry_params *p /* output */, - global struct agx_heap *heap, + global struct poly_ia_state *ia /* output */, + global struct poly_geometry_params *p /* output */, + global struct poly_heap *heap, uint64_t vs_outputs /* Vertex (TES) output mask */, uint32_t index_size_B /* 0 if no index bffer */, uint32_t index_buffer_range_el, uint32_t prim /* Input primitive type, enum mesa_prim */, - int is_prefix_summing, uint max_indices, enum agx_gs_shape shape) + int is_prefix_summing, uint max_indices, enum poly_gs_shape shape) { - /* Determine the (primitives, instances) grid size. */ - uint vertex_count = draw[0]; - uint instance_count = draw[1]; - - ia->verts_per_instance = vertex_count; - - /* Calculate number of primitives input into the GS */ - uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count); - p->input_primitives = prim_per_instance * instance_count; - - /* Invoke VS as (vertices, instances); GS as (primitives, instances) */ - p->vs_grid[0] = vertex_count; - p->vs_grid[1] = instance_count; - - p->gs_grid[0] = prim_per_instance; - p->gs_grid[1] = instance_count; - - p->primitives_log2 = util_logbase2_ceil(prim_per_instance); - - /* If indexing is enabled, the third word is the offset into the index buffer - * in elements. Apply that offset now that we have it. For a hardware - * indirect draw, the hardware would do this for us, but for software input - * assembly we need to do it ourselves. 
- */ - if (index_size_B) { - ia->index_buffer = libagx_index_buffer( - index_buffer, index_buffer_range_el, draw[2], index_size_B); - - ia->index_buffer_range_el = - libagx_index_buffer_range_el(index_buffer_range_el, draw[2]); - } - - /* We need to allocate VS and GS count buffers, do so now */ - uint vertex_buffer_size = - libagx_tcs_in_size(vertex_count * instance_count, vs_outputs); - - if (is_prefix_summing) { - p->count_buffer = agx_heap_alloc_nonatomic( - heap, p->input_primitives * p->count_buffer_stride); - } - - p->input_buffer = - (uintptr_t)agx_heap_alloc_nonatomic(heap, vertex_buffer_size); - *vertex_buffer = p->input_buffer; - - p->input_mask = vs_outputs; - - /* Allocate the index buffer and write the draw consuming it */ - global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc; - - *cmd = (VkDrawIndexedIndirectCommand){ - .indexCount = agx_gs_rast_vertices(shape, max_indices, prim_per_instance, - instance_count), - .instanceCount = agx_gs_rast_instances(shape, max_indices, - prim_per_instance, instance_count), - }; - - if (shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { - cmd->firstIndex = - agx_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4; - - p->output_index_buffer = - (global uint *)(heap->base + (cmd->firstIndex * 4)); - } -} - -/* - * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented - * manually with subgroup ops and local memory since Mesa doesn't do those - * lowerings yet. - */ -static uint2 -libagx_work_group_scan_inclusive_add(uint x, local uint *scratch) -{ - uint sg_id = get_sub_group_id(); - - /* Partial prefix sum of the subgroup */ - uint sg = sub_group_scan_inclusive_add(x); - - /* Reduction (sum) for the subgroup */ - uint sg_sum = sub_group_broadcast(sg, 31); - - /* Write out all the subgroups sums */ - barrier(CLK_LOCAL_MEM_FENCE); - scratch[sg_id] = sg_sum; - barrier(CLK_LOCAL_MEM_FENCE); - - /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all - * threads in subgroup T. - */ - uint other_sum = scratch[get_sub_group_local_id()]; - - /* Exclusive sum the subgroup sums to get the total before the current group, - * which can be added to the total for the current group. - */ - uint other_sums = sub_group_scan_exclusive_add(other_sum); - uint base = sub_group_broadcast(other_sums, sg_id); - uint prefix = base + sg; - - /* Reduce the workgroup using the prefix sum we already did */ - uint reduction = sub_group_broadcast(other_sums + other_sum, 31); - - return (uint2)(prefix, reduction); -} - -static void -_libagx_prefix_sum(local uint *scratch, global uint *buffer, uint len, - uint words, uint word) -{ - uint tid = cl_local_id.x; - - /* Main loop: complete workgroups processing 1024 values at once */ - uint i, count = 0; - uint len_remainder = len % 1024; - uint len_rounded_down = len - len_remainder; - - for (i = tid; i < len_rounded_down; i += 1024) { - global uint *ptr = &buffer[(i * words) + word]; - uint value = *ptr; - uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch); - - *ptr = count + sums[0]; - count += sums[1]; - } - - /* The last iteration is special since we won't have a full subgroup unless - * the length is divisible by the subgroup size, and we don't advance count. - */ - global uint *ptr = &buffer[(i * words) + word]; - uint value = (tid < len_remainder) ? 
*ptr : 0; - uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0]; - - if (tid < len_remainder) { - *ptr = count + scan; - } + poly_gs_setup_indirect(index_buffer, draw, vertex_buffer, ia, p, heap, + vs_outputs, index_size_B, index_buffer_range_el, prim, + is_prefix_summing, max_indices, shape); } KERNEL(1024) -libagx_prefix_sum_geom(constant struct agx_geometry_params *p) +libagx_prefix_sum_geom(constant struct poly_geometry_params *p) { local uint scratch[32]; - _libagx_prefix_sum(scratch, p->count_buffer, p->input_primitives, - p->count_buffer_stride / 4, cl_group_id.x); + poly_prefix_sum(scratch, p->count_buffer, p->input_primitives, + p->count_buffer_stride / 4, cl_group_id.x, 1024); } KERNEL(1024) -libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, +libagx_prefix_sum_tess(global struct poly_tess_args *p, global uint *c_prims, global uint *c_invs, uint increment_stats__2) { local uint scratch[32]; - _libagx_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */, - 0 /* word */); + poly_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */, + 0 /* word */, 1024); /* After prefix summing, we know the total # of indices, so allocate the * index buffer now. Elect a thread for the allocation. @@ -805,7 +387,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, /* Allocate 4-byte indices */ uint32_t elsize_B = sizeof(uint32_t); uint32_t size_B = total * elsize_B; - uint alloc_B = agx_heap_alloc_nonatomic_offs(p->heap, size_B); + uint alloc_B = poly_heap_alloc_nonatomic_offs(p->heap, size_B); p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->base) + alloc_B); /* ...and now we can generate the API indexed draw */ @@ -818,7 +400,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, desc[4] = 0; /* start_instance */ /* If necessary, increment clipper statistics too. This is only used when - * there's no geometry shader following us. See agx_nir_lower_gs.c for more + * there's no geometry shader following us. See poly_nir_lower_gs.c for more * info on the emulation. We just need to calculate the # of primitives * tessellated. */ @@ -827,150 +409,6 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, : p->isolines ? (total / 2) : (total / 3); - increment_counters(c_prims, c_invs, NULL, prims); + poly_increment_counters(c_prims, c_invs, NULL, prims); } } - -uintptr_t -libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, - gl_varying_slot location) -{ - /* Written like this to let address arithmetic work */ - return buffer + ((uintptr_t)libagx_tcs_in_offs_el(vtx, location, mask)) * 16; -} - -uintptr_t -libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx, - gl_varying_slot location) -{ - return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx, - location); -} - -unsigned -libagx_input_vertices(constant struct agx_ia_state *ia) -{ - return ia->verts_per_instance; -} - -global uint * -libagx_load_xfb_count_address(constant struct agx_geometry_params *p, int index, - int count_words, uint unrolled_id) -{ - return &p->count_buffer[(unrolled_id * count_words) + index]; -} - -uint -libagx_previous_xfb_primitives(global struct agx_geometry_params *p, - int static_count, int count_index, - int count_words, bool prefix_sum, - uint unrolled_id) -{ - if (static_count >= 0) { - /* If the number of outputted vertices per invocation is known statically, - * we can calculate the base. 
- */ - return unrolled_id * static_count; - } else { - /* Otherwise, load from the count buffer buffer. Note that the sums are - * inclusive, so index 0 is nonzero. This requires a little fixup here. We - * use a saturating unsigned subtraction so we don't read out-of-bounds. - * - * If we didn't prefix sum, there's only one element. - */ - uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0; - uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index]; - - return unrolled_id == 0 ? 0 : count; - } -} - -/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */ -#define libagx_foreach_xfb(word, index) \ - for (uint i = 0; i < 4; ++i) \ - if (word & BITFIELD_BIT(i)) - -void -libagx_pre_gs(global struct agx_geometry_params *p, uint streams, - uint buffers_written, uint4 buffer_to_stream, int4 count_index, - uint4 stride, uint4 output_end, int4 static_count, - uint invocations, uint vertices_per_prim, - global uint *gs_invocations, global uint *gs_primitives, - global uint *c_primitives, global uint *c_invocations) -{ - unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) + - !!(count_index[2] >= 0) + !!(count_index[3] >= 0); - bool prefix_sum = count_words && buffers_written; - uint unrolled_in_prims = p->input_primitives; - - /* Determine the number of primitives generated in each stream */ - uint4 in_prims = 0; - libagx_foreach_xfb(streams, i) { - in_prims[i] = libagx_previous_xfb_primitives( - p, static_count[i], count_index[i], count_words, prefix_sum, - unrolled_in_prims); - - *(p->prims_generated_counter[i]) += in_prims[i]; - } - - uint4 prims = in_prims; - uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3]; - - if (buffers_written) { - libagx_foreach_xfb(buffers_written, i) { - uint max_prims = - setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim); - - unsigned stream = buffer_to_stream[i]; - prims[stream] = min(prims[stream], max_prims); - } - - int4 overflow = prims < in_prims; - - libagx_foreach_xfb(streams, i) { - p->xfb_verts[i] = prims[i] * vertices_per_prim; - - *(p->xfb_overflow[i]) += (bool)overflow[i]; - *(p->xfb_prims_generated_counter[i]) += prims[i]; - } - - *(p->xfb_any_overflow) += any(overflow); - - /* Update XFB counters */ - libagx_foreach_xfb(buffers_written, i) { - uint32_t prim_stride_B = stride[i] * vertices_per_prim; - unsigned stream = buffer_to_stream[i]; - - global uint *ptr = p->xfb_offs_ptrs[i]; - - ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr); - *ptr += prims[stream] * prim_stride_B; - } - } - - /* The geometry shader is invoked once per primitive (after unrolling - * primitive restart). From the spec: - * - * In case of instanced geometry shaders (see section 11.3.4.2) the - * geometry shader invocations count is incremented for each separate - * instanced invocation. - */ - *gs_invocations += unrolled_in_prims * invocations; - *gs_primitives += emitted_prims; - - /* Clipper queries are not well-defined, so we can emulate them in lots of - * silly ways. We need the hardware counters to implement them properly. For - * now, just consider all primitives emitted as passing through the clipper. - * This satisfies spec text: - * - * The number of primitives that reach the primitive clipping stage. - * - * and - * - * If at least one vertex of the primitive lies inside the clipping - * volume, the counter is incremented by one or more. Otherwise, the - * counter is incremented by zero or more. 
- */ - *c_primitives += emitted_prims; - *c_invocations += emitted_prims; -} diff --git a/src/asahi/libagx/geometry.h b/src/asahi/libagx/geometry.h deleted file mode 100644 index 870f6489ca4..00000000000 --- a/src/asahi/libagx/geometry.h +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright 2023 Alyssa Rosenzweig - * Copyright 2023 Valve Corporation - * SPDX-License-Identifier: MIT - */ - -#include "asahi/lib/agx_abi.h" -#include "compiler/libcl/libcl.h" -#include "compiler/shader_enums.h" - -#include "util/bitscan.h" -#include "util/u_math.h" - -#pragma once - -#define MAX_SO_BUFFERS 4 -#define MAX_VERTEX_STREAMS 4 - -enum agx_gs_shape { - /* Indexed, where indices are encoded as: - * - * round_to_pot(max_indices) * round_to_pot(input_primitives) * - * * instance_count - * - * invoked for max_indices * input_primitives * instance_count indices. - * - * This is used with any dynamic topology. No hardware instancing used. - */ - AGX_GS_SHAPE_DYNAMIC_INDEXED, - - /* Indexed with a static index buffer. Indices ranges up to max_indices. - * Hardware instance count = input_primitives * software instance count. - */ - AGX_GS_SHAPE_STATIC_INDEXED, - - /* Non-indexed. Dispatched as: - * - * (max_indices, input_primitives * instance count). - */ - AGX_GS_SHAPE_STATIC_PER_PRIM, - - /* Non-indexed. Dispatched as: - * - * (max_indices * input_primitives, instance count). - */ - AGX_GS_SHAPE_STATIC_PER_INSTANCE, -}; - -static inline unsigned -agx_gs_rast_vertices(enum agx_gs_shape shape, unsigned max_indices, - unsigned input_primitives, unsigned instance_count) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return max_indices * input_primitives * instance_count; - - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: - return max_indices; - - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: - return max_indices * input_primitives; - } - - UNREACHABLE("invalid shape"); -} - -static inline unsigned -agx_gs_rast_instances(enum agx_gs_shape shape, unsigned max_indices, - unsigned input_primitives, unsigned instance_count) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return 1; - - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: - return input_primitives * instance_count; - - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: - return instance_count; - } - - UNREACHABLE("invalid shape"); -} - -static inline bool -agx_gs_indexed(enum agx_gs_shape shape) -{ - return shape == AGX_GS_SHAPE_DYNAMIC_INDEXED || - shape == AGX_GS_SHAPE_STATIC_INDEXED; -} - -static inline unsigned -agx_gs_index_size(enum agx_gs_shape shape) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return 4; - case AGX_GS_SHAPE_STATIC_INDEXED: - return 1; - default: - return 0; - } -} - -/* Heap to allocate from. */ -struct agx_heap { - DEVICE(uchar) base; - uint32_t bottom, size; -} PACKED; -static_assert(sizeof(struct agx_heap) == 4 * 4); - -#ifdef __OPENCL_VERSION__ -static inline uint -_agx_heap_alloc_offs(global struct agx_heap *heap, uint size_B, bool atomic) -{ - size_B = align(size_B, 16); - - uint offs; - if (atomic) { - offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B); - } else { - offs = heap->bottom; - heap->bottom = offs + size_B; - } - - /* Use printf+abort because assert is stripped from release builds. 
*/ - if (heap->bottom >= heap->size) { - printf( - "FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!", - size_B, offs, heap->size); - - abort(); - } - - return offs; -} - -static inline uint -agx_heap_alloc_nonatomic_offs(global struct agx_heap *heap, uint size_B) -{ - return _agx_heap_alloc_offs(heap, size_B, false); -} - -static inline uint -agx_heap_alloc_atomic_offs(global struct agx_heap *heap, uint size_B) -{ - return _agx_heap_alloc_offs(heap, size_B, true); -} - -static inline global void * -agx_heap_alloc_nonatomic(global struct agx_heap *heap, uint size_B) -{ - return heap->base + agx_heap_alloc_nonatomic_offs(heap, size_B); -} - -uint64_t nir_load_ro_sink_address_poly(void); - -static inline uint64_t -libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el, - uint elsize_B) -{ - if (offset_el < size_el) - return index_buffer + (offset_el * elsize_B); - else - return nir_load_ro_sink_address_poly(); -} -#endif - -struct agx_ia_state { - /* Index buffer if present. */ - uint64_t index_buffer; - - /* Size of the bound index buffer for bounds checking */ - uint32_t index_buffer_range_el; - - /* Number of vertices per instance. Written by CPU for direct draw, indirect - * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing. - */ - uint32_t verts_per_instance; -} PACKED; -static_assert(sizeof(struct agx_ia_state) == 4 * 4); - -static inline uint -libagx_index_buffer_range_el(uint size_el, uint offset_el) -{ - return offset_el < size_el ? (size_el - offset_el) : 0; -} - -struct agx_geometry_params { - /* Address of associated indirect draw buffer */ - DEVICE(uint) indirect_desc; - - /* Address of count buffer. For an indirect draw, this will be written by the - * indirect setup kernel. - */ - DEVICE(uint) count_buffer; - - /* Address of the primitives generated counters */ - DEVICE(uint) prims_generated_counter[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_overflow[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_any_overflow; - - /* Pointers to transform feedback buffer offsets in bytes */ - DEVICE(uint) xfb_offs_ptrs[MAX_SO_BUFFERS]; - - /* Output index buffer, allocated by pre-GS. */ - DEVICE(uint) output_index_buffer; - - /* Address of transform feedback buffer in general, supplied by the CPU. */ - DEVICE(uchar) xfb_base_original[MAX_SO_BUFFERS]; - - /* Address of transform feedback for the current primitive. Written by pre-GS - * program. - */ - DEVICE(uchar) xfb_base[MAX_SO_BUFFERS]; - - /* Address and present mask for the input to the geometry shader. These will - * reflect the vertex shader for VS->GS or instead the tessellation - * evaluation shader for TES->GS. - */ - uint64_t input_buffer; - uint64_t input_mask; - - /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */ - uint64_t flat_outputs; - - uint32_t xfb_size[MAX_SO_BUFFERS]; - - /* Number of vertices emitted by transform feedback per stream. Written by - * the pre-GS program. - */ - uint32_t xfb_verts[MAX_VERTEX_STREAMS]; - - /* Within an indirect GS draw, the grids used to dispatch the VS/GS written - * out by the GS indirect setup kernel or the CPU for a direct draw. This is - * the "indirect local" format: first 3 is in threads, second 3 is in grid - * blocks. This lets us use nontrivial workgroups with indirect draws without - * needing any predication. 
- */ - uint32_t vs_grid[6]; - uint32_t gs_grid[6]; - - /* Number of input primitives across all instances, calculated by the CPU for - * a direct draw or the GS indirect setup kernel for an indirect draw. - */ - uint32_t input_primitives; - - /* Number of input primitives per instance, rounded up to a power-of-two and - * with the base-2 log taken. This is used to partition the output vertex IDs - * efficiently. - */ - uint32_t primitives_log2; - - /* Number of bytes output by the GS count shader per input primitive (may be - * 0), written by CPU and consumed by indirect draw setup shader for - * allocating counts. - */ - uint32_t count_buffer_stride; - - /* Dynamic input topology. Must be compatible with the geometry shader's - * layout() declared input class. - */ - uint32_t input_topology; -} PACKED; -static_assert(sizeof(struct agx_geometry_params) == 86 * 4); - -/* TCS shared memory layout: - * - * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS]; - * - * TODO: compact. - */ -static inline uint -libagx_tcs_in_offs_el(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - uint base = vtx * util_bitcount64(crosslane_vs_out_mask); - uint offs = util_bitcount64(crosslane_vs_out_mask & - (((uint64_t)(1) << location) - 1)); - - return base + offs; -} - -static inline uint -libagx_tcs_in_offs(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - return libagx_tcs_in_offs_el(vtx, location, crosslane_vs_out_mask) * 16; -} - -static inline uint -libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask) -{ - return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16; -} - -/* - * TCS out buffer layout, per-patch: - * - * float tess_level_outer[4]; - * float tess_level_inner[2]; - * vec4 patch_out[MAX_PATCH_OUTPUTS]; - * vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS]; - * - * Vertex out are compacted based on the mask of written out. Patch - * out are used as-is. - * - * Bounding boxes are ignored. 
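- *
- * For example, with 2 patch outputs and 3 per-vertex outputs, the
- * per-vertex block starts at element 6 + 4*2 = 14 and vertex v's output
- * with compacted index i sits at element 14 + 12*v + 4*i.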
- */ -static inline uint -libagx_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint64_t vtx_out_mask) -{ - uint off = 0; - if (location == VARYING_SLOT_TESS_LEVEL_OUTER) - return off; - - off += 4; - if (location == VARYING_SLOT_TESS_LEVEL_INNER) - return off; - - off += 2; - if (location >= VARYING_SLOT_PATCH0) - return off + (4 * (location - VARYING_SLOT_PATCH0)); - - /* Anything else is a per-vtx output */ - off += 4 * nr_patch_out; - off += 4 * vtx_id * util_bitcount64(vtx_out_mask); - - uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1)); - return off + (4 * idx); -} - -static inline uint -libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask) * - 4; -} - -static inline uint -libagx_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_offs_el(out_patch_size, 0, nr_patch_out, vtx_out_mask); -} - -static inline uint -libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) * - 4; -} - -/* In a tess eval shader, stride for hw vertex ID */ -#define LIBAGX_TES_PATCH_ID_STRIDE 8192 - -static uint -libagx_compact_prim(enum mesa_prim prim) -{ - static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1); - static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2); - -#ifndef __OPENCL_VERSION__ - assert(prim != MESA_PRIM_QUADS); - assert(prim != MESA_PRIM_QUAD_STRIP); - assert(prim != MESA_PRIM_POLYGON); - assert(prim != MESA_PRIM_PATCHES); -#endif - - return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim; -} - -static enum mesa_prim -libagx_uncompact_prim(uint packed) -{ - return (packed >= MESA_PRIM_QUADS) ? (packed + 3) : packed; -} - -/* - * Write a strip into a 32-bit index buffer. This is the sequence: - * - * (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index - * - * For points, we write index buffers without restart just for remapping. 
- */ -static inline void -_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset, - uint32_t vertex_offset, uint32_t verts_in_prim, - uint32_t stream, uint32_t stream_multiplier, uint32_t n) -{ - bool restart = n > 1; - if (verts_in_prim < n) - return; - - GLOBAL uint32_t *out = &index_buffer[index_offset]; - - /* Write out indices for the strip */ - for (uint32_t i = 0; i < verts_in_prim; ++i) { - out[i] = (vertex_offset + i) * stream_multiplier + stream; - } - - if (restart) - out[verts_in_prim] = -1; -} diff --git a/src/asahi/libagx/meson.build b/src/asahi/libagx/meson.build index b772415dbbc..70d249d58b5 100644 --- a/src/asahi/libagx/meson.build +++ b/src/asahi/libagx/meson.build @@ -21,6 +21,7 @@ libagx_spv = custom_target( libagx_shader_files, '--', '-I' + join_paths(meson.project_source_root(), 'include'), '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'), + '-I' + join_paths(meson.project_source_root(), 'src/poly/cl'), '-I' + join_paths(meson.current_source_dir(), '.'), '-I' + join_paths(meson.current_source_dir(), '../../'), '-I' + join_paths(meson.current_source_dir(), 'shaders'), diff --git a/src/asahi/libagx/tessellation.cl b/src/asahi/libagx/tessellation.cl index 244158f3d38..a84eed823a3 100644 --- a/src/asahi/libagx/tessellation.cl +++ b/src/asahi/libagx/tessellation.cl @@ -3,148 +3,14 @@ * SPDX-License-Identifier: MIT */ -#include "geometry.h" -#include "tessellator.h" -#include - -uint -libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p) -{ - return p->input_patch_size; -} - -uint -libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p) -{ - return p->output_patch_size; -} - -uint -libagx_tcs_unrolled_id(constant struct libagx_tess_args *p, uint3 wg_id) -{ - return (wg_id.y * p->patches_per_instance) + wg_id.x; -} - -uint64_t -libagx_tes_buffer(constant struct libagx_tess_args *p) -{ - return p->tes_buffer; -} - -/* - * Helper to lower indexing for a tess eval shader ran as a compute shader. This - * handles the tess+geom case. This is simpler than the general input assembly - * lowering, as we know: - * - * 1. the index buffer is U32 - * 2. the index is in bounds - * - * Therefore we do a simple load. No bounds checking needed. - */ -uint32_t -libagx_load_tes_index(constant struct libagx_tess_args *p, uint32_t index) -{ - /* Swap second and third vertices of each triangle to flip winding order - * dynamically if needed. 
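-    * e.g. when p->ccw is set, the triangle (a, b, c) is fetched as
-    * (a, c, b).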
- */ - if (p->ccw) { - uint id = index % 3; - - if (id == 1) - index++; - else if (id == 2) - index--; - } - - return p->index_buffer[index]; -} - -ushort -libagx_tcs_in_offset(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - return libagx_tcs_in_offs(vtx, location, crosslane_vs_out_mask); -} - -uintptr_t -libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id, - uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint out_patch_size, uint64_t vtx_out_mask) -{ - uint stride_el = - libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask); - - uint offs_el = - libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask); - - offs_el += patch_id * stride_el; - - /* Written to match the AGX addressing mode */ - return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2); -} - -static uint -libagx_tes_unrolled_patch_id(uint raw_id) -{ - return raw_id / LIBAGX_TES_PATCH_ID_STRIDE; -} - -uint -libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id) -{ - return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance; -} - -static uint -tes_vertex_id_in_patch(uint raw_id) -{ - return raw_id % LIBAGX_TES_PATCH_ID_STRIDE; -} - -float2 -libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id) -{ - uint patch = libagx_tes_unrolled_patch_id(raw_id); - uint vtx = tes_vertex_id_in_patch(raw_id); - - global struct libagx_tess_point *t = - &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; - - /* Written weirdly because NIR struggles with loads of structs */ - uint2 fixed = *((global uint2 *)t); - - /* Convert fixed point to float */ - return convert_float2(fixed) / (1u << 16); -} - -uintptr_t -libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id, - uint vtx_id, gl_varying_slot location) -{ - uint patch = libagx_tes_unrolled_patch_id(raw_id); - - return libagx_tcs_out_address(p, patch, vtx_id, location, - p->tcs_patch_constants, p->output_patch_size, - p->tcs_per_vertex_outputs); -} - -float4 -libagx_tess_level_outer_default(constant struct libagx_tess_args *p) -{ - return vload4(0, p->tess_level_outer_default); -} - -float2 -libagx_tess_level_inner_default(constant struct libagx_tess_args *p) -{ - return vload2(0, p->tess_level_inner_default); -} +#include "poly/geometry.h" +#include "poly/tessellator.h" KERNEL(1) libagx_tess_setup_indirect( - global struct libagx_tess_args *p, + global struct poly_tess_args *p, global uint32_t *grids /* output: VS then TCS then tess */, - global struct agx_ia_state *ia /* output */, global uint32_t *indirect, + global struct poly_ia_state *ia /* output */, global uint32_t *indirect, global uint64_t *vertex_output_buffer_ptr, uint64_t in_index_buffer, uint32_t in_index_buffer_range_el, uint32_t in_index_size_B, uint64_t vertex_outputs /* bitfield */, @@ -174,11 +40,11 @@ libagx_tess_setup_indirect( alloc += unrolled_patches * sizeof(uint32_t); uint vb_offs = alloc; - uint vb_size = libagx_tcs_in_size(count * instance_count, vertex_outputs); + uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs); alloc += vb_size; /* Allocate all patch calculations in one go */ - global uchar *blob = agx_heap_alloc_nonatomic(p->heap, alloc); + global uchar *blob = poly_heap_alloc_nonatomic(p->heap, alloc); p->tcs_buffer = (global float *)(blob + tcs_out_offs); p->patches_per_instance = in_patches; @@ -201,11 +67,11 @@ libagx_tess_setup_indirect( */ if (in_index_size_B) { ia->index_buffer = - libagx_index_buffer(in_index_buffer, 
in_index_buffer_range_el, - indirect[2], in_index_size_B); + poly_index_buffer(in_index_buffer, in_index_buffer_range_el, + indirect[2], in_index_size_B); ia->index_buffer_range_el = - libagx_index_buffer_range_el(in_index_buffer_range_el, indirect[2]); + poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]); } /* VS grid size */ diff --git a/src/asahi/libagx/tessellator.cl b/src/asahi/libagx/tessellator.cl index 957230e422d..6dcbdd8320e 100644 --- a/src/asahi/libagx/tessellator.cl +++ b/src/asahi/libagx/tessellator.cl @@ -19,1594 +19,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "poly/cl/tessellator.h" -#include "util/u_math.h" -#include "geometry.h" -#include "tessellator.h" - -#define LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f -#define LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f - -typedef unsigned int FXP; // fixed point number - -enum { - U = 0, // points on a tri patch - V = 1, -}; - -enum { - Ueq0 = 0, // edges on a tri patch - Veq0 = 1, - Weq0 = 2, -}; - -enum { - Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 - Veq1 = 3, -}; - -#define QUAD_AXES 2 -#define QUAD_EDGES 4 -#define TRI_EDGES 3 - -// The interior can just use a simpler stitch. -typedef enum DIAGONALS { - DIAGONALS_INSIDE_TO_OUTSIDE, - DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, - DIAGONALS_MIRRORED -} DIAGONALS; - -typedef struct TESS_FACTOR_CONTEXT { - FXP fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpInvNumSegmentsOnCeilTessFactor; - FXP fxpHalfTessFactorFraction; - int numHalfTessFactorPoints; - int splitPointOnFloorHalfTessFactor; -} TESS_FACTOR_CONTEXT; - -struct INDEX_PATCH_CONTEXT { - int insidePointIndexDeltaToRealValue; - int insidePointIndexBadValue; - int insidePointIndexReplacementValue; - int outsidePointIndexPatchBase; - int outsidePointIndexDeltaToRealValue; - int outsidePointIndexBadValue; - int outsidePointIndexReplacementValue; -}; - -struct INDEX_PATCH_CONTEXT2 { - int baseIndexToInvert; - int indexInversionEndPoint; - int cornerCaseBadValue; - int cornerCaseReplacementValue; -}; - -struct CHWTessellator { - enum libagx_tess_mode mode; - uint index_bias; - - // array where we will store u/v's for the points we generate - global struct libagx_tess_point *Point; - - // array where we will store index topology - global void *Index; - - // A second index patch we have to do handles the leftover strip of quads in - // the middle of an odd quad patch after finishing all the concentric rings. - // This also handles the leftover strip of points in the middle of an even - // quad patch, when stitching the row of triangles up the left side (V major - // quad) or bottom (U major quad) of the inner ring - bool bUsingPatchedIndices; - bool bUsingPatchedIndices2; - struct INDEX_PATCH_CONTEXT IndexPatchCtx; - struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; -}; - -#define FXP_INTEGER_BITS 15 -#define FXP_FRACTION_BITS 16 -#define FXP_FRACTION_MASK 0x0000ffff -#define FXP_INTEGER_MASK 0x7fff0000 -#define FXP_ONE (1 << FXP_FRACTION_BITS) -#define FXP_ONE_THIRD 0x00005555 -#define FXP_TWO_THIRDS 0x0000aaaa -#define FXP_ONE_HALF 0x00008000 - -static global float * -tess_factors(constant struct libagx_tess_args *p, uint patch) +KERNEL(64) +libagx_tess_isoline(constant struct poly_tess_args *p, + enum poly_tess_mode mode__2) { - return p->tcs_buffer + (patch * p->tcs_stride_el); -} - -/* - * Generate an indexed draw for a patch with the computed number of indices. 
- * This allocates heap memory for the index buffer, returning the allocated - * memory. - */ -static global void * -libagx_draw(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, - bool lines, uint patch, uint count) -{ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = count; - } - - if (mode == LIBAGX_TESS_MODE_WITH_COUNTS) { - /* The index buffer is already allocated, get a pointer inside it. - * p->counts has had an inclusive prefix sum hence the subtraction. - */ - uint offset_el = p->counts[sub_sat(patch, 1u)]; - if (patch == 0) - offset_el = 0; - - return &p->index_buffer[offset_el]; - } - - return NULL; -} - -static void -libagx_draw_points(private struct CHWTessellator *ctx, - constant struct libagx_tess_args *p, uint patch, uint count) -{ - /* For points mode with a single draw, we need to generate a trivial index - * buffer to stuff in the patch ID in the right place. - */ - global uint32_t *indices = libagx_draw(p, ctx->mode, false, patch, count); - - if (ctx->mode == LIBAGX_TESS_MODE_COUNT) - return; - - for (int i = 0; i < count; ++i) { - indices[i] = ctx->index_bias + i; - } -} - -static void -libagx_draw_empty(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode, - uint patch) -{ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = 0; - } -} - -/* - * Allocate heap memory for domain points for a patch. The allocation - * is recorded in the coord_allocs[] array, which is in elements. - */ -static global struct libagx_tess_point * -libagx_heap_alloc_points(constant struct libagx_tess_args *p, uint patch, - uint count) -{ - /* If we're recording statistics, increment now. The statistic is for - * tessellation evaluation shader invocations, which is equal to the number - * of domain points generated. - */ - if (p->statistic) { - atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); - } - - uint32_t elsize_B = sizeof(struct libagx_tess_point); - uint32_t alloc_B = agx_heap_alloc_atomic_offs(p->heap, elsize_B * count); - uint32_t alloc_el = alloc_B / elsize_B; - - p->coord_allocs[patch] = alloc_el; - return (global struct libagx_tess_point *)(((uintptr_t)p->heap->base) + - alloc_B); -} - -// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 -// amar.patel@microsoft.com - -#define LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR 1 -#define LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR 63 -#define LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 -#define LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 - -// 2^(-16), min positive fixed point fraction -#define EPSILON 0.0000152587890625f -#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ - (LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) - -static float clamp_factor(float factor, - enum libagx_tess_partitioning partitioning, - float maxf) -{ - float lower = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) - ? LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR - : LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR; - - float upper = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD) - ? LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR - : LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR; - - // If any TessFactor will end up > 1 after floatToFixed conversion later, - // then force the inside TessFactors to be > 1 so there is a picture frame. 
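-   // e.g. a fractional_odd factor of exactly 1.0 keeps the lower clamp at
-   // 1.0, but once any factor exceeds 1 + EPSILON/2 the lower clamp becomes
-   // 1 + EPSILON, forcing the picture frame to exist.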
- if (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD && - maxf > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) { - - lower = LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON; - } - - factor = clamp(factor, lower, upper); - - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - factor = ceil(factor); - } - - return factor; -} - - -static FXP -floatToFixed(const float input) -{ - return mad(input, FXP_ONE, 0.5f); -} - -static bool -isOdd(const float input) -{ - return ((int)input) & 1; -} - -static FXP -fxpCeil(const FXP input) -{ - if (input & FXP_FRACTION_MASK) { - return (input & FXP_INTEGER_MASK) + FXP_ONE; - } - return input; -} - -static FXP -fxpFloor(const FXP input) -{ - return (input & FXP_INTEGER_MASK); -} - -static int -PatchIndexValue(private struct CHWTessellator *ctx, int index) -{ - if (ctx->bUsingPatchedIndices) { - // assumed remapped outide indices are > remapped inside vertices - if (index >= ctx->IndexPatchCtx.outsidePointIndexPatchBase) { - if (index == ctx->IndexPatchCtx.outsidePointIndexBadValue) - return ctx->IndexPatchCtx.outsidePointIndexReplacementValue; - else - return index + ctx->IndexPatchCtx.outsidePointIndexDeltaToRealValue; - } else { - if (index == ctx->IndexPatchCtx.insidePointIndexBadValue) - return ctx->IndexPatchCtx.insidePointIndexReplacementValue; - else - return index + ctx->IndexPatchCtx.insidePointIndexDeltaToRealValue; - } - } else if (ctx->bUsingPatchedIndices2) { - if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) { - return ctx->IndexPatchCtx2.cornerCaseReplacementValue; - } else if (index >= ctx->IndexPatchCtx2.baseIndexToInvert) { - return ctx->IndexPatchCtx2.indexInversionEndPoint - index; - } - } - - return index; -} - -static void -DefinePoint(global struct libagx_tess_point *out, FXP fxpU, FXP fxpV) -{ - out->u = fxpU; - out->v = fxpV; -} - -static void -DefineIndex(private struct CHWTessellator *ctx, int index, - int indexStorageOffset) -{ - global uint32_t *indices = (global uint32_t *)ctx->Index; - indices[indexStorageOffset] = ctx->index_bias + PatchIndexValue(ctx, index); -} - -static void -DefineTriangle(private struct CHWTessellator *ctx, int index0, int index1, - int index2, int indexStorageBaseOffset) -{ - index0 = PatchIndexValue(ctx, index0); - index1 = PatchIndexValue(ctx, index1); - index2 = PatchIndexValue(ctx, index2); - - vstore3(ctx->index_bias + (uint3)(index0, index1, index2), 0, - (global uint *)ctx->Index + indexStorageBaseOffset); -} - -static uint32_t -RemoveMSB(uint32_t val) -{ - uint32_t bit = val ? (1 << (31 - clz(val))) : 0; - return val & ~bit; -} - -static int -NumPointsForTessFactor(bool odd, FXP fxpTessFactor) -{ - // Add epsilon for rounding and add 1 for odd - FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1); - int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1); - return odd ? r : r + 1; -} - -static void -ComputeTessFactorCtx(bool odd, FXP fxpTessFactor, - private TESS_FACTOR_CONTEXT *TessFactorCtx) -{ - // fxpHalfTessFactor == 1/2 if TessFactor is 1, - // but we're pretending we are even. 
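// (For reference, NumPointsForTessFactor() above maps a TessFactor of N to
// N + 1 edge points, one per segment boundary: an even factor of 4.0 yields
// 5 points, an odd factor of 3.0 yields 4.)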
- FXP fxpHalfTessFactor = (fxpTessFactor + 1 /*round*/) / 2; - if (odd || (fxpHalfTessFactor == FXP_ONE_HALF)) { - fxpHalfTessFactor += FXP_ONE_HALF; - } - FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); - FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); - TessFactorCtx->fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; - TessFactorCtx->numHalfTessFactorPoints = - (fxpCeilHalfTessFactor >> FXP_FRACTION_BITS); // for EVEN, we don't include the point always - // fixed at the midpoint of the TessFactor - if (fxpCeilHalfTessFactor == fxpFloorHalfTessFactor) { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - /*pick value to cause this to be ignored*/ TessFactorCtx->numHalfTessFactorPoints + 1; - } else if (odd) { - if (fxpFloorHalfTessFactor == FXP_ONE) { - TessFactorCtx->splitPointOnFloorHalfTessFactor = 0; - } else { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - (RemoveMSB((fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) - 1) << 1) + 1; - } - } else { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - (RemoveMSB(fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) << 1) + 1; - } - int numFloorSegments = (fxpFloorHalfTessFactor * 2) >> FXP_FRACTION_BITS; - int numCeilSegments = (fxpCeilHalfTessFactor * 2) >> FXP_FRACTION_BITS; - if (odd) { - numFloorSegments -= 1; - numCeilSegments -= 1; - } - TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor = - floatToFixed(1.0f / (float)numFloorSegments); - TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor = - floatToFixed(1.0f / (float)numCeilSegments); -} - -static FXP -PlacePointIn1D(private const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, - int point) -{ - bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints; - - if (bFlip) { - point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd; - } - - // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly - if (point == TessFactorCtx->numHalfTessFactorPoints) - return FXP_ONE_HALF; - - unsigned int indexOnCeilHalfTessFactor = point; - unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; - if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) { - indexOnFloorHalfTessFactor -= 1; - } - // For the fixed point multiplies below, we know the results are <= 16 bits - // because the locations on the halfTessFactor are <= half the number of - // segments for the total TessFactor. So a number divided by a number that - // is at least twice as big will give a result no bigger than 0.5 (which in - // fixed point is 16 bits in our case) - FXP fxpLocationOnFloorHalfTessFactor = - indexOnFloorHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpLocationOnCeilHalfTessFactor = - indexOnCeilHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor; - - // Since we know the numbers calculated above are <= fixed point 0.5, and the - // equation below is just lerping between two values <= fixed point 0.5 - // (0x00008000), then we know that the final result before shifting by 16 bits - // is no larger than 0x80000000. 
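// (That is, fxpLocation below is a convex blend of the floor- and
// ceil-TessFactor positions weighted by fxpHalfTessFactorFraction: both
// positions are at most 0x8000 and the two weights sum to FXP_ONE.)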
Once we shift that down by 16, we get the - // result of lerping 2 numbers <= 0.5, which is obviously at most 0.5 - // (0x00008000) - FXP fxpLocation = - fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) + - fxpLocationOnCeilHalfTessFactor * (TessFactorCtx->fxpHalfTessFactorFraction); - fxpLocation = (fxpLocation + FXP_ONE_HALF /*round*/) >> FXP_FRACTION_BITS; // get back to n.16 - if (bFlip) { - fxpLocation = FXP_ONE - fxpLocation; - } - return fxpLocation; -} - -static void -StitchRegular(private struct CHWTessellator *ctx, bool bTrapezoid, - DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, - int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) -{ - int insidePoint = insideEdgePointBaseOffset; - int outsidePoint = outsideEdgePointBaseOffset; - if (bTrapezoid) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - int p; - switch (diagonals) { - case DIAGONALS_INSIDE_TO_OUTSIDE: - // Diagonals pointing from inside edge forward towards outside edge - for (p = 0; p < numInsideEdgePoints - 1; p++) { - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation - // Diagonals pointing from outside edge forward towards inside edge - - // First half - for (p = 0; p < numInsideEdgePoints / 2 - 1; p++) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - - // Middle - DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - p += 2; - - // Second half - for (; p < numInsideEdgePoints; p++) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - case DIAGONALS_MIRRORED: - // First half, diagonals pointing from outside of outside edge to inside of - // inside edge - for (p = 0; p < numInsideEdgePoints / 2; p++) { - DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - // Second half, diagonals pointing from inside of inside edge to outside of - // outside edge - for (; p < numInsideEdgePoints - 1; p++) { - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - } - if (bTrapezoid) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - } -} - -// loop_start and 
loop_end give optimal loop bounds for
-// the stitching algorithm further below, for any given halfTessFactor. There
-// is probably a better way to encode this...
-//
-// Return the FIRST entry in finalPointPositionTable which is less than
-// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
-static int
-loop_start(int N)
-{
-   if (N < 2)
-      return 1;
-   else if (N == 2)
-      return 17;
-   else if (N < 5)
-      return 9;
-   else if (N < 9)
-      return 5;
-   else if (N < 17)
-      return 3;
-   else
-      return 2;
-}
-
-// Return the LAST entry in finalPointPositionTable[] which is less than
-// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
-static int
-loop_end(int N)
-{
-   if (N < 2)
-      return 0;
-   else if (N < 4)
-      return 17;
-   else if (N < 8)
-      return 25;
-   else if (N < 16)
-      return 29;
-   else if (N < 32)
-      return 31;
-   else
-      return 32;
-}
-
-// Tables to assist in the stitching of 2 rows of points having arbitrary
-// TessFactors. The stitching order is governed by Ruler Function vertex
-// split ordering (see external documentation).
-//
-// The contents of the finalPointPositionTable are where vertex i [0..33]
-// ends up on the half-edge at the max tessellation amount given
-// ruler-function split order. Recall the other half of an edge is mirrored,
-// so we only need to deal with one half. This table is used to decide when
-// to advance a point on the interior or exterior. It supports odd TessFactor
-// up to 65 and even TessFactor up to 64.
-
-/* TODO: Is this actually faster than a LUT? */
-static uint32_t
-finalPointPositionTable(uint32_t x)
-{
-   if (x == 0)
-      return 0;
-   if (x == 1)
-      return 0x20;
-
-   uint32_t shift;
-   if ((x & 1) == 0) {
-      shift = 1;
-   } else if ((x & 3) == 3) {
-      shift = 2;
-   } else if ((x & 7) == 5) {
-      shift = 3;
-   } else if (x != 17) {
-      shift = 4;
-   } else {
-      shift = 5;
-   }
-
-   // SWAR vectorized right-shift of (0x20, x)
-   // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`.
-   uint32_t items_to_shift = x | (0x20 << 16);
-   uint32_t shifted = items_to_shift >> shift;
-
-   uint32_t bias = min(0xfu, shifted >> 16);
-   return bias + (shifted & 0xffff);
-}
-
-static void
-StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset,
-                 int insideEdgePointBaseOffset,
-                 int insideNumHalfTessFactorPoints,
-                 bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset,
-                 int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd)
-{
-   if (insideEdgeTessFactorOdd) {
-      insideNumHalfTessFactorPoints -= 1;
-   }
-   if (outsideTessFactorOdd) {
-      outsideNumHalfTessFactorPoints -= 1;
-   }
-   // Walk first half
-   int outsidePoint = outsideEdgePointBaseOffset;
-   int insidePoint = insideEdgePointBaseOffset;
-
-   // iStart,iEnd are a small optimization so the loop below doesn't have to go
-   // from 0 up to 31
-   int iStart = min(loop_start(insideNumHalfTessFactorPoints),
-                    loop_start(outsideNumHalfTessFactorPoints));
-   int iEnd = loop_end(
-      max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints));
-
-   // since we don't start the loop at 0 below, we need a special case.
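// (Evaluating finalPointPositionTable() above gives {0, 32, 16, 8, 17, 4,
// 18, 9, 19, 2, ...}; each iteration of the loop below advances the inside
// point, the outside point, or both, whenever the table value is under the
// corresponding numHalfTessFactorPoints, interleaving the two rows in
// ruler order.)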
- if (0 < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - - for (int i = iStart; i <= iEnd; i++) { - int bound = finalPointPositionTable(i); - - if (bound < insideNumHalfTessFactorPoints) { - // Advance inside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - if (bound < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - } - - if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) || - insideEdgeTessFactorOdd) { - if (insideEdgeTessFactorOdd == outsideTessFactorOdd) { - // Quad in the middle - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint + 1, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } else if (!insideEdgeTessFactorOdd) { - // Triangle pointing inside - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } else { - // Triangle pointing outside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - } - - // Walk second half. - for (int i = iEnd; i >= iStart; i--) { - int bound = finalPointPositionTable(i); - - if (bound < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - if (bound < insideNumHalfTessFactorPoints) { - // Advance inside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - } - // Below case is not needed if we didn't optimize loop above and made it run - // from 31 down to 0. - if (0 < outsideNumHalfTessFactorPoints) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } + uint patch = cl_global_id.x; + poly_tess_isoline_process(p, patch, mode__2); } KERNEL(64) -libagx_tess_isoline(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) +libagx_tess_tri(constant struct poly_tess_args *p, enum poly_tess_mode mode__2) { - enum libagx_tess_mode mode = mode__2; uint patch = cl_global_id.x; - enum libagx_tess_partitioning partitioning = p->partitioning; - - bool lineDensityOdd; - bool lineDetailOdd; - TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; - TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; - - global float *factors = tess_factors(p, patch); - float TessFactor_V_LineDensity = factors[0]; - float TessFactor_U_LineDetail = factors[1]; - - // Is the patch culled? NaN will pass. 
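// (Deliberately !(factor > 0) rather than factor <= 0: comparisons against
// NaN are false, so a NaN tess factor also culls the patch.)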
- if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) { - libagx_draw_empty(p, mode, patch); - return; - } - - // Clamp edge TessFactors - TessFactor_V_LineDensity = - clamp(TessFactor_V_LineDensity, - LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, - LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR); - TessFactor_U_LineDetail = - clamp_factor(TessFactor_U_LineDetail, partitioning, 0); - - // Process tessFactors - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - lineDetailOdd = isOdd(TessFactor_U_LineDetail); - } else { - lineDetailOdd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - } - - FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); - - ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail, - &lineDetailTessFactorCtx); - int numPointsPerLine = - NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail); - - TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); - lineDensityOdd = isOdd(TessFactor_V_LineDensity); - FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); - ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity, - &lineDensityTessFactorCtx); - - // don't draw last line at V == 1. - int numLines = - NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1; - - /* Points */ - uint num_points = numPointsPerLine * numLines; - if (mode != LIBAGX_TESS_MODE_COUNT) { - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, num_points); - - for (int line = 0, pointOffset = 0; line < numLines; line++) { - FXP fxpV = - PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line); - - for (int point = 0; point < numPointsPerLine; point++) { - FXP fxpU = - PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point); - - DefinePoint(&points[pointOffset++], fxpU, fxpV); - } - } - } - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - /* Connectivity */ - if (!p->points_mode) { - uint num_indices = numLines * (numPointsPerLine - 1) * 2; - ctx.Index = libagx_draw(p, mode, true, patch, num_indices); - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines; - line++) { - pointOffset++; - - for (int point = 1; point < numPointsPerLine; point++) { - DefineIndex(&ctx, pointOffset - 1, indexOffset++); - DefineIndex(&ctx, pointOffset, indexOffset++); - pointOffset++; - } - } - } else { - libagx_draw_points(&ctx, p, patch, num_points); - } + poly_tess_tri_process(p, patch, mode__2); } KERNEL(64) -libagx_tess_tri(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) +libagx_tess_quad(constant struct poly_tess_args *p, enum poly_tess_mode mode__2) { - enum libagx_tess_mode mode = mode__2; uint patch = cl_global_id.x; - enum libagx_tess_partitioning partitioning = p->partitioning; - - global float *factors = tess_factors(p, patch); - float tessFactor_Ueq0 = factors[0]; - float tessFactor_Veq0 = factors[1]; - float tessFactor_Weq0 = factors[2]; - float insideTessFactor_f = factors[4]; - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - // Is the patch culled? NaN will pass. 
- if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) || - !(tessFactor_Weq0 > 0)) { - - libagx_draw_empty(p, mode, patch); - - return; - } - - FXP outsideTessFactor[TRI_EDGES]; - FXP insideTessFactor; - bool outsideTessFactorOdd[TRI_EDGES]; - bool insideTessFactorOdd; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int numPointsForOutsideEdge[TRI_EDGES]; - int numPointsForInsideTessFactor; - int insideEdgePointBaseOffset; - - // Clamp TessFactors - tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); - tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); - tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0); - - float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0); - insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf); - // Note the above clamps map NaN to the lower bound - - // Process tessFactors - float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, - tessFactor_Weq0}; - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); - } - insideTessFactorOdd = - isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f); - } else { - bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactorOdd[edge] = odd; - } - insideTessFactorOdd = odd; - } - - // Save fixed point TessFactors - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); - } - insideTessFactor = floatToFixed(insideTessFactor_f); - - if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { - // Special case if all TessFactors are 1 - if ((FXP_ONE == insideTessFactor) && - (FXP_ONE == outsideTessFactor[Ueq0]) && - (FXP_ONE == outsideTessFactor[Veq0]) && - (FXP_ONE == outsideTessFactor[Weq0])) { - - /* Just do minimum tess factor */ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = 3; - return; - } - - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, 3); - - DefinePoint(&points[0], 0, - FXP_ONE); // V=1 (beginning of Ueq0 edge VW) - DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU) - DefinePoint(&points[2], FXP_ONE, - 0); // U=1 (beginning of Weq0 edge UV) - - if (!p->points_mode) { - ctx.Index = libagx_draw(p, mode, false, patch, 3); - - DefineTriangle(&ctx, 0, 1, 2, - /*indexStorageBaseOffset*/ 0); - } else { - libagx_draw_points(&ctx, p, patch, 3); - } - - return; - } - } - - // Compute per-TessFactor metadata - for (int edge = 0; edge < TRI_EDGES; edge++) { - ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], - &outsideTessFactorCtx[edge]); - } - ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor, - &insideTessFactorCtx); - - // Compute some initial data. - int NumPoints = 0; - - // outside edge offsets and storage - for (int edge = 0; edge < TRI_EDGES; edge++) { - numPointsForOutsideEdge[edge] = NumPointsForTessFactor( - outsideTessFactorOdd[edge], outsideTessFactor[edge]); - NumPoints += numPointsForOutsideEdge[edge]; - } - NumPoints -= 3; - - // inside edge offsets - numPointsForInsideTessFactor = - NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor); - { - int pointCountMin = insideTessFactorOdd ? 
4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - numPointsForInsideTessFactor = - max(pointCountMin, numPointsForInsideTessFactor); - } - - insideEdgePointBaseOffset = NumPoints; - - // inside storage, including interior edges above - { - int interiorRings = (numPointsForInsideTessFactor >> 1) - 1; - int even = insideTessFactorOdd ? 0 : 1; - NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even; - } - - /* GENERATE POINTS */ - if (mode != LIBAGX_TESS_MODE_COUNT) { - ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); - - // Generate exterior ring edge points, clockwise starting from point V - // (VW, the U==0 edge) - int pointOffset = 0; - for (int edge = 0; edge < TRI_EDGES; edge++) { - int odd = edge & 0x1; - int endPoint = numPointsForOutsideEdge[edge] - 1; - // don't include end, since next edge starts with it. - for (int p = 0; p < endPoint; p++, pointOffset++) { - // whether to reverse point order given we are defining V or U (W - // implicit): edge0, VW, has V decreasing, so reverse 1D points - // below edge1, WU, has U increasing, so don't reverse 1D points - // below edge2, UV, has U decreasing, so reverse 1D points below - int q = odd ? p : endPoint - p; - - FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], - outsideTessFactorOdd[edge], q); - DefinePoint(&ctx.Point[pointOffset], (edge == 0) ? 0 : fxpParam, - (edge == 0) ? fxpParam - : (edge == 2) ? FXP_ONE - fxpParam - : 0); - } - } - - // Generate interior ring points, clockwise spiralling in - int numRings = (numPointsForInsideTessFactor >> 1); - for (int ring = 1; ring < numRings; ring++) { - int startPoint = ring; - int endPoint = numPointsForInsideTessFactor - 1 - startPoint; - - int perpendicularAxisPoint = startPoint; - FXP fxpPerpParam = PlacePointIn1D( - &insideTessFactorCtx, insideTessFactorOdd, perpendicularAxisPoint); - - // Map location to the right size in - // barycentric space. We know this fixed - // point math won't over/underflow - fxpPerpParam *= FXP_TWO_THIRDS; - fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >> - FXP_FRACTION_BITS; // get back to n.16 - - for (int edge = 0; edge < TRI_EDGES; edge++) { - int odd = edge & 0x1; - - // don't include end: next edge starts with it. - for (int p = startPoint; p < endPoint; p++, pointOffset++) { - // whether to reverse point given we are defining V or U (W - // implicit): edge0, VW, has V decreasing, so reverse 1D points - // below edge1, WU, has U increasing, so don't reverse 1D points - // below edge2, UV, has U decreasing, so reverse 1D points below - int q = odd ? p : endPoint - (p - startPoint); - - FXP fxpParam = - PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q); - // edge0 VW, has perpendicular parameter U constant - // edge1 WU, has perpendicular parameter V constant - // edge2 UV, has perpendicular parameter W constant - // reciprocal is the rate of change of edge-parallel parameters - // as they are pushed into the triangle - const unsigned int deriv = 2; - - // we know this fixed point math won't over/underflow - FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv; - - DefinePoint(&ctx.Point[pointOffset], - edge > 0 ? tmp : fxpPerpParam, - edge == 0 ? tmp - : edge == 1 ? fxpPerpParam - : FXP_ONE - tmp - fxpPerpParam); - } - } - } - if (!insideTessFactorOdd) { - // Last point is the point at the center. 
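// (u = v = 1/3 with the implicit w = 1/3, the centroid of the barycentric
// domain; FXP_ONE_THIRD is 0x5555, the closest 16-bit fraction to 1/3.)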
- DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD); - } - } - - if (p->points_mode) { - libagx_draw_points(&ctx, p, patch, NumPoints); - return; - } - - { - // Generate primitives for all the concentric rings, one side at a time - // for each ring +1 is so even tess includes the center point, which we - // want to now - int numRings = ((numPointsForInsideTessFactor + 1) >> 1); - - int NumIndices = 0; - { - int OuterPoints = numPointsForOutsideEdge[0] + - numPointsForOutsideEdge[1] + - numPointsForOutsideEdge[2]; - - int numRings18 = numRings * 18; - NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) + - (3 * OuterPoints) - (numRings18 * (numRings - 1)) + - (insideTessFactorOdd ? 3 : 0); - } - - // Generate the draw and allocate the index buffer now that we know the size - ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - int insideOffset = insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - - NumIndices = 0; - for (int ring = 1; ring < numRings; ring++) { - int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring; - int edge0InsidePointBaseOffset = insideOffset; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - for (int edge = 0; edge < TRI_EDGES; edge++) { - int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge] - : (numPointsForInsideEdge + 2); - - int numTriangles = numPointsForInsideEdge + outsidePoints - 2; - - int insideBaseOffset; - int outsideBaseOffset; - if (edge == 2) { - ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = - insideOffset; - ctx.IndexPatchCtx.insidePointIndexBadValue = - numPointsForInsideEdge - 1; - ctx.IndexPatchCtx.insidePointIndexReplacementValue = - edge0InsidePointBaseOffset; - ctx.IndexPatchCtx.outsidePointIndexPatchBase = - ctx.IndexPatchCtx.insidePointIndexBadValue + - 1; // past inside patched index range - ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = - outsideEdgePointBaseOffset - - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.IndexPatchCtx.outsidePointIndexBadValue = - ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints - - 1; - ctx.IndexPatchCtx.outsidePointIndexReplacementValue = - edge0OutsidePointBaseOffset; - ctx.bUsingPatchedIndices = true; - insideBaseOffset = 0; - outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase; - } else { - insideBaseOffset = insideOffset; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if (ring == 1) { - StitchTransition( - &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, - insideTessFactorCtx.numHalfTessFactorPoints, - insideTessFactorOdd, outsideBaseOffset, - outsideTessFactorCtx[edge].numHalfTessFactorPoints, - outsideTessFactorOdd[edge]); - } else { - StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */ NumIndices, - numPointsForInsideEdge, insideBaseOffset, - outsideBaseOffset); - } - if (2 == edge) { - ctx.bUsingPatchedIndices = false; - } - NumIndices += numTriangles * 3; - outsideEdgePointBaseOffset += outsidePoints - 1; - insideOffset += numPointsForInsideEdge - 1; - } - } - if (insideTessFactorOdd) { - // Triangulate center (a single triangle) - DefineTriangle(&ctx, outsideEdgePointBaseOffset, - outsideEdgePointBaseOffset + 1, - outsideEdgePointBaseOffset + 2, NumIndices); - NumIndices += 3; - } - } -} - -KERNEL(64) -libagx_tess_quad(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) -{ - enum libagx_tess_mode mode = mode__2; - uint patch = cl_global_id.x; - enum 
libagx_tess_partitioning partitioning = p->partitioning; - global float *factors = tess_factors(p, patch); - - float tessFactor_Ueq0 = factors[0]; - float tessFactor_Veq0 = factors[1]; - float tessFactor_Ueq1 = factors[2]; - float tessFactor_Veq1 = factors[3]; - - float insideTessFactor_U = factors[4]; - float insideTessFactor_V = factors[5]; - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - // Is the patch culled? - if (!(tessFactor_Ueq0 > 0) || // NaN will pass - !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || - !(tessFactor_Veq1 > 0)) { - libagx_draw_empty(p, mode, patch); - return; - } - - FXP outsideTessFactor[QUAD_EDGES]; - FXP insideTessFactor[QUAD_AXES]; - bool outsideTessFactorOdd[QUAD_EDGES]; - bool insideTessFactorOdd[QUAD_AXES]; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int numPointsForOutsideEdge[QUAD_EDGES]; - int numPointsForInsideTessFactor[QUAD_AXES]; - int insideEdgePointBaseOffset; - - // Clamp edge TessFactors - tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); - tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); - tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); - tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); - - float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), - max(tessFactor_Ueq1, tessFactor_Veq1)), - max(insideTessFactor_U, insideTessFactor_V)); - - insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); - insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); - // Note the above clamps map NaN to lowerBound - - // Process tessFactors - float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, - tessFactor_Ueq1, tessFactor_Veq1}; - float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, - insideTessFactor_V}; - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); - } - for (int axis = 0; axis < QUAD_AXES; axis++) { - insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && - (1.0f != insideTessFactor_f[axis]); - } - } else { - bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactorOdd[edge] = odd; - } - insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; - } - - // Save fixed point TessFactors - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); - } - for (int axis = 0; axis < QUAD_AXES; axis++) { - insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); - } - - if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { - // Special case if all TessFactors are 1 - if ((FXP_ONE == insideTessFactor[U]) && - (FXP_ONE == insideTessFactor[V]) && - (FXP_ONE == outsideTessFactor[Ueq0]) && - (FXP_ONE == outsideTessFactor[Veq0]) && - (FXP_ONE == outsideTessFactor[Ueq1]) && - (FXP_ONE == outsideTessFactor[Veq1])) { - - /* Just do minimum tess factor */ - if (!p->points_mode) { - ctx.Index = libagx_draw(p, mode, false, patch, 6); - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - DefineTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); - DefineTriangle(&ctx, 1, 2, 3, /*indexStorageOffset*/ 3); - } else { - 
libagx_draw_points(&ctx, p, patch, 4); - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - } - - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, 4); - - DefinePoint(&points[0], 0, 0); - DefinePoint(&points[1], FXP_ONE, 0); - DefinePoint(&points[2], FXP_ONE, FXP_ONE); - DefinePoint(&points[3], 0, FXP_ONE); - return; - } - } - - // Compute TessFactor-specific metadata - for (int edge = 0; edge < QUAD_EDGES; edge++) { - ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], - &outsideTessFactorCtx[edge]); - } - - for (int axis = 0; axis < QUAD_AXES; axis++) { - ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], - &insideTessFactorCtx[axis]); - } - - int NumPoints = 0; - - // outside edge offsets and storage - for (int edge = 0; edge < QUAD_EDGES; edge++) { - numPointsForOutsideEdge[edge] = NumPointsForTessFactor( - outsideTessFactorOdd[edge], outsideTessFactor[edge]); - NumPoints += numPointsForOutsideEdge[edge]; - } - NumPoints -= 4; - - // inside edge offsets - for (int axis = 0; axis < QUAD_AXES; axis++) { - numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( - insideTessFactorOdd[axis], insideTessFactor[axis]); - int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - numPointsForInsideTessFactor[axis] = - max(pointCountMin, numPointsForInsideTessFactor[axis]); - } - - insideEdgePointBaseOffset = NumPoints; - - // inside storage, including interior edges above - int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * - (numPointsForInsideTessFactor[V] - 2); - NumPoints += numInteriorPoints; - - if (mode != LIBAGX_TESS_MODE_COUNT) { - ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); - - // Generate exterior ring edge points, clockwise from top-left - int pointOffset = 0; - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd = edge & 0x1; - // don't include end, since next edge starts with it. - int endPoint = numPointsForOutsideEdge[edge] - 1; - for (int p = 0; p < endPoint; p++, pointOffset++) { - int q = - ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order - FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], - outsideTessFactorOdd[edge], q); - - FXP u = odd ? fxpParam : ((edge == 2) ? FXP_ONE : 0); - FXP v = odd ? ((edge == 3) ? FXP_ONE : 0) : fxpParam; - DefinePoint(&ctx.Point[pointOffset], u, v); - } - } - - // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) - // spiralling toward center - int minNumPointsForTessFactor = - min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); - // note for even tess we aren't counting center point here. - int numRings = (minNumPointsForTessFactor >> 1); - - for (int ring = 1; ring < numRings; ring++) { - int startPoint = ring; - int endPoint[QUAD_AXES] = { - numPointsForInsideTessFactor[U] - 1 - startPoint, - numPointsForInsideTessFactor[V] - 1 - startPoint, - }; - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; - int perpendicularAxisPoint = - (edge < 2) ? startPoint : endPoint[odd[0]]; - FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], - insideTessFactorOdd[odd[0]], - perpendicularAxisPoint); - - for (int p = startPoint; p < endPoint[odd[1]]; p++, - pointOffset++) // don't include end: next edge starts with - // it. - { - bool odd_ = odd[1]; - int q = ((edge == 1) || (edge == 2)) - ? 
p - : endPoint[odd_] - (p - startPoint); - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd_], - insideTessFactorOdd[odd_], q); - DefinePoint(&ctx.Point[pointOffset], - odd_ ? fxpPerpParam : fxpParam, - odd_ ? fxpParam : fxpPerpParam); - } - } - } - // For even tessellation, the inner "ring" is degenerate - a row of points - if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && - !insideTessFactorOdd[V]) { - int startPoint = numRings; - int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; - for (int p = startPoint; p <= endPoint; p++, pointOffset++) { - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], - insideTessFactorOdd[U], p); - DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); - } - } else if ((numPointsForInsideTessFactor[V] >= - numPointsForInsideTessFactor[U]) && - !insideTessFactorOdd[U]) { - int startPoint = numRings; - int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; - for (int p = endPoint; p >= startPoint; p--, pointOffset++) { - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], - insideTessFactorOdd[V], p); - DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); - } - } - } - - if (p->points_mode) { - libagx_draw_points(&ctx, p, patch, NumPoints); - return; - } - - /* CONNECTIVITY */ - { - // Generate primitives for all the concentric rings, one side at a time - // for each ring. +1 is so even tess includes the center point - int numPointRowsToCenter[QUAD_AXES] = { - (numPointsForInsideTessFactor[U] + 1) >> 1, - (numPointsForInsideTessFactor[V] + 1) >> 1, - }; - - int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); - - /* Calculate # of indices so we can allocate */ - { - /* Handle main case */ - int OuterPoints = - numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + - numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; - - int InnerPoints = - numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; - - int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - - (InnerPoints * 18) - (24 * numRings * (numRings - 1)); - - /* Determine major/minor axes */ - bool U_major = - (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); - unsigned M = U_major ? U : V; - unsigned m = U_major ? V : U; - - /* Handle degenerate ring */ - if (insideTessFactorOdd[m]) { - NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - - (numPointsForInsideTessFactor[m] >> 1)); - NumIndices += (insideTessFactorOdd[M] ? 6 : 12); - } - - // Generate the draw and allocate the index buffer with the size - ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); - } - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - int degeneratePointRing[QUAD_AXES] = { - // Even partitioning causes degenerate row of points, - // which results in exceptions to the point ordering conventions - // when travelling around the rings counterclockwise. - !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, - !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, - }; - - int numPointsForOutsideEdge_[QUAD_EDGES] = { - numPointsForOutsideEdge[Ueq0], - numPointsForOutsideEdge[Veq0], - numPointsForOutsideEdge[Ueq1], - numPointsForOutsideEdge[Veq1], - }; - - int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - - int NumIndices = 0; - - for (int ring = 1; ring < numRings; ring++) { - int numPointsForInsideEdge[QUAD_AXES] = { - numPointsForInsideTessFactor[U] - 2 * ring, - numPointsForInsideTessFactor[V] - 2 * ring}; - - int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd = (edge + 1) & 0x1; - - int numTriangles = - numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; - int insideBaseOffset; - int outsideBaseOffset; - - // We need to patch the indexing so Stitch() can think it sees 2 - // sequentially increasing rows of points, even though we have - // wrapped around to the end of the inner and outer ring's points, - // so the last point is really the first point for the ring. We make - // it so that when Stitch() calls AddIndex(), that function will do - // any necessary index adjustment. - if (edge == 3) { - if (ring == degeneratePointRing[odd]) { - ctx.IndexPatchCtx2.baseIndexToInvert = - insideEdgePointBaseOffset_ + 1; - ctx.IndexPatchCtx2.cornerCaseBadValue = - outsideEdgePointBaseOffset + - numPointsForOutsideEdge_[edge] - 1; - ctx.IndexPatchCtx2.cornerCaseReplacementValue = - edge0OutsidePointBaseOffset; - ctx.IndexPatchCtx2.indexInversionEndPoint = - (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; - insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - ctx.bUsingPatchedIndices2 = true; - } else { - ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = - insideEdgePointBaseOffset_; - ctx.IndexPatchCtx.insidePointIndexBadValue = - numPointsForInsideEdge[odd] - 1; - ctx.IndexPatchCtx.insidePointIndexReplacementValue = - edge0InsidePointBaseOffset; - ctx.IndexPatchCtx.outsidePointIndexPatchBase = - ctx.IndexPatchCtx.insidePointIndexBadValue + - 1; // past inside patched index range - ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = - outsideEdgePointBaseOffset - - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.IndexPatchCtx.outsidePointIndexBadValue = - ctx.IndexPatchCtx.outsidePointIndexPatchBase + - numPointsForOutsideEdge_[edge] - 1; - ctx.IndexPatchCtx.outsidePointIndexReplacementValue = - edge0OutsidePointBaseOffset; - - insideBaseOffset = 0; - outsideBaseOffset = - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.bUsingPatchedIndices = true; - } - } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { - ctx.IndexPatchCtx2.baseIndexToInvert = - insideEdgePointBaseOffset_; - ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused - ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert << 1; - insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - ctx.bUsingPatchedIndices2 = true; - } else { - insideBaseOffset = insideEdgePointBaseOffset_; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if (ring == 1) { - StitchTransition( - &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, - insideTessFactorCtx[odd].numHalfTessFactorPoints, - insideTessFactorOdd[odd], outsideBaseOffset, - 
outsideTessFactorCtx[edge].numHalfTessFactorPoints, - outsideTessFactorOdd[edge]); - } else { - StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */ NumIndices, - numPointsForInsideEdge[odd], insideBaseOffset, - outsideBaseOffset); - } - ctx.bUsingPatchedIndices = false; - ctx.bUsingPatchedIndices2 = false; - NumIndices += numTriangles * 3; - outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; - if ((edge == 2) && (ring == degeneratePointRing[odd])) { - insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; - } else { - insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; - } - numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; - } - } - - // Triangulate center - a row of quads if odd - // This triangulation may be producing diagonals that are asymmetric about - // the center of the patch in this region. - if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && - insideTessFactorOdd[V]) { - ctx.bUsingPatchedIndices2 = true; - int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - - (numPointsForInsideTessFactor[V] >> 1)) - << 1) + - (insideTessFactorOdd[U] ? 1 : 2); - ctx.IndexPatchCtx2.baseIndexToInvert = - outsideEdgePointBaseOffset + stripNumQuads + 2; - ctx.IndexPatchCtx2.cornerCaseBadValue = - ctx.IndexPatchCtx2.baseIndexToInvert; - ctx.IndexPatchCtx2.cornerCaseReplacementValue = - outsideEdgePointBaseOffset; - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert + - ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; - StitchRegular( - &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, - /*baseIndexOffset: */ NumIndices, - /*numInsideEdgePoints:*/ stripNumQuads + 1, - /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, - outsideEdgePointBaseOffset + 1); - ctx.bUsingPatchedIndices2 = false; - NumIndices += stripNumQuads * 6; - } else if ((numPointsForInsideTessFactor[V] >= - numPointsForInsideTessFactor[U]) && - insideTessFactorOdd[U]) { - ctx.bUsingPatchedIndices2 = true; - int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - - (numPointsForInsideTessFactor[U] >> 1)) - << 1) + - (insideTessFactorOdd[V] ? 1 : 2); - ctx.IndexPatchCtx2.baseIndexToInvert = - outsideEdgePointBaseOffset + stripNumQuads + 1; - ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert + - ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; - DIAGONALS diag = insideTessFactorOdd[V] - ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE - : DIAGONALS_INSIDE_TO_OUTSIDE; - StitchRegular( - &ctx, /*bTrapezoid*/ false, diag, - /*baseIndexOffset: */ NumIndices, - /*numInsideEdgePoints:*/ stripNumQuads + 1, - /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, - outsideEdgePointBaseOffset); - ctx.bUsingPatchedIndices2 = false; - NumIndices += stripNumQuads * 6; - } - } + poly_tess_quad_process(p, patch, mode__2); } diff --git a/src/asahi/libagx/tessellator.h b/src/asahi/libagx/tessellator.h index 5841d5578f1..4cf8ab01938 100644 --- a/src/asahi/libagx/tessellator.h +++ b/src/asahi/libagx/tessellator.h @@ -5,104 +5,14 @@ #pragma once -#include "compiler/libcl/libcl.h" +#include "poly/tessellator.h" -enum libagx_tess_partitioning { - LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD, - LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN, - LIBAGX_TESS_PARTITIONING_INTEGER, -}; - -enum libagx_tess_mode { - /* Do not actually tessellate, just write the index counts */ - LIBAGX_TESS_MODE_COUNT, - - /* Tessellate using the count buffers to allocate indices */ - LIBAGX_TESS_MODE_WITH_COUNTS, -}; - -struct libagx_tess_point { - uint32_t u; - uint32_t v; -}; -static_assert(sizeof(struct libagx_tess_point) == 8); - -struct libagx_tess_args { - /* Heap to allocate tessellator outputs in */ - DEVICE(struct agx_heap) heap; - - /* Patch coordinate buffer, indexed as: - * - * coord_allocs[patch_ID] + vertex_in_patch - */ - DEVICE(struct libagx_tess_point) patch_coord_buffer; - - /* Per-patch index within the heap for the tess coords, written by the - * tessellator based on the allocated memory. - */ - DEVICE(uint32_t) coord_allocs; - - /* Space for output draws from the tessellator. API draw calls. */ - DEVICE(uint32_t) out_draws; - - /* Tessellation control shader output buffer. */ - DEVICE(float) tcs_buffer; - - /* Count buffer. # of indices per patch written here, then prefix summed. */ - DEVICE(uint32_t) counts; - - /* Allocated index buffer for all patches, if we're prefix summing counts */ - DEVICE(uint32_t) index_buffer; - - /* Address of the tess eval invocation counter for implementing pipeline - * statistics, if active. Zero if inactive. Incremented by tessellator. - */ - DEVICE(uint32_t) statistic; - - /* When geom+tess used together, the buffer containing TES outputs (executed - * as a hardware compute shader). - */ - uint64_t tes_buffer; - - /* Bitfield of TCS per-vertex outputs */ - uint64_t tcs_per_vertex_outputs; - - /* Default tess levels used in OpenGL when there is no TCS in the pipeline. - * Unused in Vulkan and OpenGL ES. - */ - float tess_level_outer_default[4]; - float tess_level_inner_default[2]; - - /* Number of vertices in the input patch */ - uint32_t input_patch_size; - - /* Number of vertices in the TCS output patch */ - uint32_t output_patch_size; - - /* Number of patch constants written by TCS */ - uint32_t tcs_patch_constants; - - /* Number of input patches per instance of the VS/TCS */ - uint32_t patches_per_instance; - - /* Stride between tessellation facotrs in the TCS output buffer. */ - uint32_t tcs_stride_el; - - /* Number of patches being tessellated */ - uint32_t nr_patches; - - /* Partitioning and points mode. These affect per-patch setup code but not - * the hot tessellation loop so we make them dynamic to reduce tessellator - * variants. - */ - enum libagx_tess_partitioning partitioning; - uint32_t points_mode; - uint32_t isolines; - - /* When fed into a geometry shader, triangles should be counter-clockwise. 
- * The tessellator always produces clockwise triangles, but we can swap - * dynamically in the TES. - */ - uint32_t ccw; -} PACKED; -static_assert(sizeof(struct libagx_tess_args) == 36 * 4); +#define libagx_tessellate(context, grid, barrier, prim, mode, state) \ + if (prim == TESS_PRIMITIVE_QUADS) { \ + libagx_tess_quad(context, grid, barrier, state, mode); \ + } else if (prim == TESS_PRIMITIVE_TRIANGLES) { \ + libagx_tess_tri(context, grid, barrier, state, mode); \ + } else { \ + assert(prim == TESS_PRIMITIVE_ISOLINES); \ + libagx_tess_isoline(context, grid, barrier, state, mode); \ + } diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c index 9ff1006134e..b8eab93c4d6 100644 --- a/src/asahi/vulkan/hk_cmd_dispatch.c +++ b/src/asahi/vulkan/hk_cmd_dispatch.c @@ -5,10 +5,10 @@ * SPDX-License-Identifier: MIT */ #include "libagx/query.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "vulkan/vulkan_core.h" #include "agx_helpers.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_pack.h" #include "agx_scratch.h" #include "agx_tilebuffer.h" diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c index 2c07b0fa168..a31098eb6fa 100644 --- a/src/asahi/vulkan/hk_cmd_draw.c +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -5,6 +5,7 @@ * SPDX-License-Identifier: MIT */ #include +#include "poly/nir/poly_nir_lower_gs.h" #include "agx_abi.h" #include "agx_bg_eot.h" #include "agx_bo.h" @@ -13,7 +14,6 @@ #include "agx_device.h" #include "agx_helpers.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_ppp.h" #include "agx_tilebuffer.h" @@ -31,10 +31,10 @@ #include "asahi/genxml/agx_pack.h" #include "asahi/libagx/compression.h" -#include "asahi/libagx/geometry.h" #include "asahi/libagx/libagx.h" #include "asahi/libagx/query.h" #include "asahi/libagx/tessellator.h" +#include "poly/geometry.h" #include "util/blend.h" #include "util/format/format_utils.h" #include "util/format/u_formats.h" @@ -1007,9 +1007,9 @@ hk_heap(struct hk_cmd_buffer *cmd) * the CPU as rodata, even though the GPU uses it for scratch internally. 
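 * The allocator effectively bump-allocates by atomically advancing
 * poly_heap::bottom, which is why zeroing that single field below is enough
 * to free every per-draw tessellator allocation at once.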
*/ off_t off = dev->rodata.heap - dev->rodata.bo->va->addr; - struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off; + struct poly_heap *map = agx_bo_map(dev->rodata.bo) + off; - *map = (struct agx_heap){ + *map = (struct poly_heap){ .base = dev->heap->va->addr, .size = size, }; @@ -1021,7 +1021,7 @@ hk_heap(struct hk_cmd_buffer *cmd) uint64_t addr = dev->rodata.heap; /* Zeroing the allocated index frees everything */ - hk_queue_write(cmd, addr + offsetof(struct agx_heap, bottom), 0, + hk_queue_write(cmd, addr + offsetof(struct poly_heap, bottom), 0, true /* after gfx */); cmd->uses_heap = true; @@ -1045,7 +1045,7 @@ hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) { assert(!agx_is_indirect(draw.b) && "indirect params written by GPU"); - struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]}; + struct poly_ia_state ia = {.verts_per_instance = draw.b.count[0]}; if (draw.indexed) { unsigned index_size_B = agx_index_size_to_B(draw.index_size); @@ -1115,7 +1115,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) mode = u_decomposed_prim(mode); } - struct agx_geometry_params params = { + struct poly_geometry_params params = { .flat_outputs = fs->info.fs.interp.flat, .input_topology = mode, @@ -1174,7 +1174,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.vs_grid[4] = params.gs_grid[4] = 1; params.vs_grid[5] = params.gs_grid[5] = 1; - struct agx_gs_info *gsi = &count->info.gs; + struct poly_gs_info *gsi = &count->info.gs; if (indirect) { /* TODO: size */ @@ -1183,7 +1183,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.indirect_desc = cmd->geom_indirect; params.vs_grid[2] = params.gs_grid[2] = 1; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { /* Need to allocate heap if we haven't yet */ hk_heap(cmd); @@ -1191,7 +1191,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) cmd->geom_index_count = dev->heap->size; } else { cmd->geom_index_count = - agx_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0); + poly_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0); } } else { uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; @@ -1207,13 +1207,13 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu; } - cmd->geom_index_count = agx_gs_rast_vertices( + cmd->geom_index_count = poly_gs_rast_vertices( gsi->shape, gsi->max_indices, params.gs_grid[0], instances); - cmd->geom_instance_count = agx_gs_rast_instances( + cmd->geom_instance_count = poly_gs_rast_instances( gsi->shape, gsi->max_indices, params.gs_grid[0], instances); - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { params.output_index_buffer = hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu; @@ -1221,7 +1221,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) } } - if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) { cmd->geom_index_buffer = hk_pool_upload(cmd, count->info.gs.topology, gsi->max_indices * 4, 4); } @@ -1231,7 +1231,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) } static void -hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out, +hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct poly_tess_args *out, struct agx_draw draw) { struct 
hk_device *dev = hk_cmd_buffer_device(cmd); @@ -1239,14 +1239,14 @@ hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out, struct hk_graphics_state *gfx = &cmd->state.gfx; struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); - enum libagx_tess_partitioning partitioning = + enum poly_tess_partitioning partitioning = gfx->tess.info.spacing == TESS_SPACING_EQUAL - ? LIBAGX_TESS_PARTITIONING_INTEGER + ? POLY_TESS_PARTITIONING_INTEGER : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD - ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD - : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN; + ? POLY_TESS_PARTITIONING_FRACTIONAL_ODD + : POLY_TESS_PARTITIONING_FRACTIONAL_EVEN; - struct libagx_tess_args args = { + struct poly_tess_args args = { .heap = hk_heap(cmd), .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4, .statistic = hk_pipeline_stat_addr( @@ -1428,7 +1428,7 @@ hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct agx_draw draw, libagx_unroll_restart_struct(cmd, agx_1d(1024 * draw_count), AGX_BARRIER_ALL | AGX_PREGFX, ia, - libagx_compact_prim(prim)); + poly_compact_prim(prim)); return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr, dev->heap->size, draw.index_size, @@ -1485,7 +1485,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) { gsi.vertex_buffer = desc->root.draw.tess_params + - offsetof(struct libagx_tess_args, tes_buffer); + offsetof(struct poly_tess_args, tes_buffer); } else { gsi.vertex_buffer = desc->root.root_desc_addr + offsetof(struct hk_root_descriptor_table, @@ -1501,10 +1501,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, AGX_BARRIER_ALL | AGX_PREGFX, gsi); grid_vs = agx_grid_indirect_local( - geometry_params + offsetof(struct agx_geometry_params, vs_grid)); + geometry_params + offsetof(struct poly_geometry_params, vs_grid)); grid_gs = agx_grid_indirect_local( - geometry_params + offsetof(struct agx_geometry_params, gs_grid)); + geometry_params + offsetof(struct poly_geometry_params, gs_grid)); } else { grid_vs = grid_gs = draw.b; grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]); @@ -1554,9 +1554,9 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, /* Pre-rast geometry shader */ hk_dispatch_with_local_size(cmd, cs, main, grid_gs, wg); - if (agx_gs_indexed(count->info.gs.shape)) { + if (poly_gs_indexed(count->info.gs.shape)) { enum agx_index_size index_size = - agx_translate_index_size(agx_gs_index_size(count->info.gs.shape)); + agx_translate_index_size(poly_gs_index_size(count->info.gs.shape)); if (agx_is_indirect(draw.b)) { return agx_draw_indexed_indirect( @@ -1661,13 +1661,13 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, /* First generate counts, then prefix sum them, and then tessellate. 
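 * The same tessellator entry point runs twice: the POLY_TESS_MODE_COUNT pass
 * only writes per-patch index counts, the prefix sum turns those counts into
 * allocation offsets, and the POLY_TESS_MODE_WITH_COUNTS pass then emits the
 * actual indices into the space those offsets describe.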
*/ libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode, - LIBAGX_TESS_MODE_COUNT, state); + POLY_TESS_MODE_COUNT, state); libagx_prefix_sum_tess(cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX, state, c_prims, c_inv, c_prims || c_inv); libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode, - LIBAGX_TESS_MODE_WITH_COUNTS, state); + POLY_TESS_MODE_WITH_COUNTS, state); return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr, dev->heap->size, AGX_INDEX_SIZE_U32, false); @@ -2219,8 +2219,9 @@ hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs) uint32_t index = cmd->state.gfx.index.restart; if (gs) { - enum agx_gs_shape shape = gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape; - index = BITFIELD_MASK(8 * agx_gs_index_size(shape)); + enum poly_gs_shape shape = + gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape; + index = BITFIELD_MASK(8 * poly_gs_index_size(shape)); } /* VDM State updates are relatively expensive, so only emit them when the @@ -3061,7 +3062,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, bool indirect = agx_is_indirect(draw.b) || draw.restart; desc->root.draw.input_assembly = - indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu + indirect ? hk_pool_alloc(cmd, sizeof(struct poly_ia_state), 4).gpu : hk_upload_ia_params(cmd, draw); desc->root_dirty = true; } @@ -3078,7 +3079,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, if (!indirect) { uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; unsigned vb_size = - libagx_tcs_in_size(verts * instances, vs->b.info.outputs); + poly_tcs_in_size(verts * instances, vs->b.info.outputs); /* Allocate if there are any outputs, or use the null sink to trap * reads if there aren't. 
Those reads are undefined but should not @@ -3094,7 +3095,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_ptr tess_args = {0}; if (gfx->shaders[MESA_SHADER_TESS_EVAL]) { - tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4); + tess_args = hk_pool_alloc(cmd, sizeof(struct poly_tess_args), 4); gfx->descriptors.root.draw.tess_params = tess_args.gpu; gfx->descriptors.root_dirty = true; } diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c index 66a22934bb2..117ae784839 100644 --- a/src/asahi/vulkan/hk_device.c +++ b/src/asahi/vulkan/hk_device.c @@ -19,8 +19,8 @@ #include "asahi/genxml/agx_pack.h" #include "asahi/lib/agx_bo.h" #include "asahi/lib/agx_device.h" -#include "asahi/libagx/geometry.h" #include "compiler/nir/nir_builder.h" +#include "poly/geometry.h" #include "util/hash_table.h" #include "util/ralloc.h" #include "util/simple_mtx.h" @@ -86,7 +86,7 @@ hk_upload_rodata(struct hk_device *dev) */ offs = align(offs, sizeof(uint64_t)); dev->rodata.heap = dev->rodata.bo->va->addr + offs; - offs += sizeof(struct agx_heap); + offs += sizeof(struct poly_heap); return VK_SUCCESS; } diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c index 0cce56c0ec4..229887c6b46 100644 --- a/src/asahi/vulkan/hk_shader.c +++ b/src/asahi/vulkan/hk_shader.c @@ -8,10 +8,10 @@ */ #include "hk_shader.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "agx_debug.h" #include "agx_device.h" #include "agx_helpers.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "glsl_types.h" #include "hk_instance.h" @@ -1114,13 +1114,13 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, shader->info.tess.tcs_output_patch_size = nir->info.tess.tcs_vertices_out; shader->info.tess.tcs_per_vertex_outputs = - agx_tcs_per_vertex_outputs(nir); + poly_tcs_per_vertex_outputs(nir); shader->info.tess.tcs_nr_patch_outputs = util_last_bit(nir->info.patch_outputs_written); - shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir); + shader->info.tess.tcs_output_stride = poly_tcs_output_stride(nir); } else { /* This destroys info so it needs to happen after the gather */ - NIR_PASS(_, nir, agx_nir_lower_tes, hw); + NIR_PASS(_, nir, poly_nir_lower_tes, hw); } } @@ -1137,7 +1137,7 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, if (hw) { hk_lower_hw_vs(nir, shader, kill_psiz); } else { - NIR_PASS(_, nir, agx_nir_lower_vs_before_gs); + NIR_PASS(_, nir, poly_nir_lower_vs_before_gs); nir->info.stage = MESA_SHADER_COMPUTE; memset(&nir->info.cs, 0, sizeof(nir->info.cs)); nir->xfb_info = NULL; @@ -1335,7 +1335,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, hk_populate_vs_key(&key_tmp.vs, state); key = &key_tmp; } else if (sw_stage == MESA_SHADER_TESS_CTRL) { - NIR_PASS(_, nir, agx_nir_lower_tcs); + NIR_PASS(_, nir, poly_nir_lower_tcs); } /* Compile all variants up front */ @@ -1345,7 +1345,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; - NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs, + NIR_PASS(_, nir, poly_nir_lower_gs, &count, &rast, &pre_gs, &count_variant->info.gs); agx_preprocess_nir(count); diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h index 836c7fbffaa..36712f30f33 100644 --- a/src/asahi/vulkan/hk_shader.h +++ b/src/asahi/vulkan/hk_shader.h @@ -8,9 +8,9 @@ #pragma once #include 
"asahi/compiler/agx_compile.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/macros.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_pack.h" #include "agx_usc.h" @@ -94,7 +94,7 @@ struct hk_shader_info { struct hk_tess_info info; } tess; - struct agx_gs_info gs; + struct poly_gs_info gs; /* Used to initialize the union for other stages */ uint8_t _pad[32]; diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index 5061ecde35d..c1ebac6c431 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -5,10 +5,10 @@ #include "compiler/nir/nir_builder.h" #include "pipe/p_defines.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitset.h" #include "util/u_dynarray.h" #include "agx_abi.h" -#include "agx_nir_lower_gs.h" #include "agx_state.h" #include "nir.h" #include "nir_builder_opcodes.h" diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 91489b117ac..438bf63d8fa 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -34,6 +34,8 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" +#include "poly/geometry.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitscan.h" #include "util/bitset.h" #include "util/blend.h" @@ -57,10 +59,8 @@ #include "agx_disk_cache.h" #include "agx_linker.h" #include "agx_nir.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_tilebuffer.h" -#include "geometry.h" #include "libagx.h" #include "libagx_dgc.h" #include "libagx_shaders.h" @@ -1544,7 +1544,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader); /* Auxiliary programs */ - struct agx_gs_info gs_info = {0}; + struct poly_gs_info gs_info = {0}; uint64_t outputs = 0; struct agx_fs_epilog_link_info epilog_key = {false}; nir_shader *gs_count = NULL; @@ -1564,7 +1564,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, struct asahi_vs_shader_key *key = &key_->vs; if (nir->info.vs.tes_poly) { - NIR_PASS(_, nir, agx_nir_lower_tes, key->hw); + NIR_PASS(_, nir, poly_nir_lower_tes, key->hw); } else { NIR_PASS(_, nir, agx_nir_gather_vs_inputs, attrib_components_read); NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog); @@ -1580,7 +1580,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs); } else { - NIR_PASS(_, nir, agx_nir_lower_vs_before_gs); + NIR_PASS(_, nir, poly_nir_lower_vs_before_gs); /* Turn into a compute shader now that we're free of vertexisms */ nir->info.stage = MESA_SHADER_COMPUTE; @@ -1589,9 +1589,9 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, outputs = nir->info.outputs_written; } } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - NIR_PASS(_, nir, agx_nir_lower_tcs); + NIR_PASS(_, nir, poly_nir_lower_tcs); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { - NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs, + NIR_PASS(_, nir, poly_nir_lower_gs, &gs_count, &gs_copy, &pre_gs, &gs_info); agx_preprocess_nir(gs_count); @@ -1932,11 +1932,11 @@ agx_create_shader_state(struct pipe_context *pctx, so->tess.spacing = nir->info.tess.spacing; so->tess.output_patch_size = 
nir->info.tess.tcs_vertices_out; so->tess.primitive = nir->info.tess._primitive_mode; - so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir); + so->tess.per_vertex_outputs = poly_tcs_per_vertex_outputs(nir); so->tess.nr_patch_outputs = util_last_bit(nir->info.patch_outputs_written); if (nir->info.stage == MESA_SHADER_TESS_CTRL) - so->tess.output_stride = agx_tcs_output_stride(nir); + so->tess.output_stride = poly_tcs_output_stride(nir); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { so->gs_mode = nir->info.gs.output_primitive; } @@ -3903,7 +3903,7 @@ agx_batch_heap(struct agx_batch *batch) PIPE_USAGE_DEFAULT, size); } - struct agx_heap heap = { + struct poly_heap heap = { .base = agx_resource(ctx->heap)->bo->va->addr, .size = size, }; @@ -3924,7 +3924,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, const struct pipe_draw_start_count_bias *draw, const struct pipe_draw_indirect_info *indirect) { - struct agx_ia_state ia = { + struct poly_ia_state ia = { .index_buffer = input_index_buffer, .index_buffer_range_el = index_buffer_size_B / info->index_size, .verts_per_instance = draw ? draw->count : 0, @@ -3933,7 +3933,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, batch->uniforms.input_assembly = agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8); - struct agx_geometry_params params = { + struct poly_geometry_params params = { .indirect_desc = batch->geom_indirect, .flat_outputs = batch->ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded, @@ -4017,8 +4017,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, params.input_primitives = params.gs_grid[0] * info->instance_count; - unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count, - batch->uniforms.vertex_outputs); + unsigned vb_size = poly_tcs_in_size(draw->count * info->instance_count, + batch->uniforms.vertex_outputs); unsigned size = params.input_primitives * params.count_buffer_stride; if (size && prefix_sum) { @@ -4034,8 +4034,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, params.input_buffer = addr; } - struct agx_gs_info *gsi = &batch->ctx->gs->gs; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + struct poly_gs_info *gsi = &batch->ctx->gs->gs; + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { unsigned idx_size = params.input_primitives * gsi->max_indices; params.output_index_buffer = @@ -4125,10 +4125,10 @@ agx_launch_gs_prerast(struct agx_batch *batch, libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi); grid_vs = agx_grid_indirect_local( - gp + offsetof(struct agx_geometry_params, vs_grid)); + gp + offsetof(struct poly_geometry_params, vs_grid)); grid_gs = agx_grid_indirect_local( - gp + offsetof(struct agx_geometry_params, gs_grid)); + gp + offsetof(struct poly_geometry_params, gs_grid)); } else { grid_vs = agx_3d(draws->count, info->instance_count, 1); @@ -4246,7 +4246,7 @@ agx_draw_without_restart(struct agx_batch *batch, /* Unroll the index buffer for each draw */ libagx_unroll_restart_struct(batch, agx_1d(1024 * indirect->draw_count), AGX_BARRIER_ALL, unroll, - libagx_compact_prim(info->mode)); + poly_compact_prim(info->mode)); /* Now draw the results without restart */ struct pipe_draw_info new_info = { @@ -4538,8 +4538,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); enum mesa_prim out_prim = 
agx_tess_output_prim(tcs, tes); - enum libagx_tess_partitioning partitioning = - (enum libagx_tess_partitioning)pspacing; + enum poly_tess_partitioning partitioning = + (enum poly_tess_partitioning)pspacing; struct agx_bo *draw_bo = NULL; size_t draw_stride = 5 * sizeof(uint32_t); @@ -4557,7 +4557,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, if (info->index_size) ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent); - struct agx_ia_state ia = { + struct poly_ia_state ia = { .index_buffer = ib, .index_buffer_range_el = ib_extent, .verts_per_instance = draws ? draws->count : 0, @@ -4572,7 +4572,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, uint64_t heap = agx_batch_heap(batch); assert((tcs->tess.output_stride & 3) == 0 && "must be aligned"); - struct libagx_tess_args args = { + struct poly_tess_args args = { .heap = heap, .tcs_stride_el = tcs->tess.output_stride / 4, .statistic = agx_get_query_address( @@ -4644,8 +4644,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, args.out_draws = blob.gpu + draw_offs; args.counts = blob.gpu + count_offs; - unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, - batch->uniforms.vertex_outputs); + unsigned vb_size = poly_tcs_in_size(draws->count * info->instance_count, + batch->uniforms.vertex_outputs); uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; batch->uniforms.vertex_output_buffer_ptr = agx_pool_upload(&batch->pool, &addr, 8); @@ -4716,11 +4716,11 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, /* Generate counts, then prefix sum them, then finally tessellate. */ libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode, - LIBAGX_TESS_MODE_COUNT, state); + POLY_TESS_MODE_COUNT, state); libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state, c_prims, c_invs, c_prims || c_invs); libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode, - LIBAGX_TESS_MODE_WITH_COUNTS, state); + POLY_TESS_MODE_WITH_COUNTS, state); /* Face culling state needs to be specialized for tess */ ctx->dirty |= AGX_DIRTY_RS; @@ -5141,12 +5141,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, agx_launch_gs_prerast(batch, info, draws, indirect); /* Setup to rasterize the GS results */ - struct agx_gs_info *gsi = &ctx->gs->gs; + struct poly_gs_info *gsi = &ctx->gs->gs; info_gs = (struct pipe_draw_info){ .mode = gsi->mode, - .index_size = agx_gs_index_size(gsi->shape), - .primitive_restart = agx_gs_indexed(gsi->shape), - .restart_index = agx_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0, + .index_size = poly_gs_index_size(gsi->shape), + .primitive_restart = poly_gs_indexed(gsi->shape), + .restart_index = poly_gs_index_size(gsi->shape) == 1 ? 
0xFF : ~0, .index.resource = &index_rsrc.base, .instance_count = 1, }; @@ -5167,11 +5167,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, u_decomposed_prims_for_vertices(info->mode, draws->count); draw_gs = (struct pipe_draw_start_count_bias){ - .count = agx_gs_rast_vertices(gsi->shape, gsi->max_indices, prims, - info->instance_count), + .count = poly_gs_rast_vertices(gsi->shape, gsi->max_indices, prims, + info->instance_count), }; - info_gs.instance_count = agx_gs_rast_instances( + info_gs.instance_count = poly_gs_rast_instances( gsi->shape, gsi->max_indices, prims, info->instance_count); draws = &draw_gs; @@ -5184,10 +5184,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, batch->reduced_prim = u_reduced_prim(info->mode); ctx->dirty |= AGX_DIRTY_PRIM; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { ib = batch->geom_index; ib_extent = index_rsrc.bo->size - (batch->geom_index - ib); - } else if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) { + } else if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) { ib = agx_pool_upload(&batch->pool, gsi->topology, gsi->max_indices); ib_extent = gsi->max_indices; } diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 31c88ee989e..32b21bf4ffc 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -18,13 +18,14 @@ #include "asahi/lib/agx_tilebuffer.h" #include "asahi/lib/agx_uvs.h" #include "asahi/lib/pool.h" -#include "asahi/libagx/geometry.h" #include "compiler/shader_enums.h" #include "gallium/auxiliary/util/u_blitter.h" #include "gallium/include/pipe/p_context.h" #include "gallium/include/pipe/p_screen.h" #include "gallium/include/pipe/p_state.h" #include "pipe/p_defines.h" +#include "poly/geometry.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitset.h" #include "util/disk_cache.h" #include "util/hash_table.h" @@ -32,7 +33,6 @@ #include "util/u_range.h" #include "agx_bg_eot.h" #include "agx_helpers.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_texture.h" #ifdef __GLIBC__ @@ -248,7 +248,7 @@ struct agx_compiled_shader { struct agx_compiled_shader *gs_count, *pre_gs; struct agx_compiled_shader *gs_copy; - struct agx_gs_info gs; + struct poly_gs_info gs; /* Logical shader stage used for descriptor access. This may differ from the * physical shader stage of the compiled shader, for example when executing a diff --git a/src/meson.build b/src/meson.build index fdfb2dc246f..d08e26466da 100644 --- a/src/meson.build +++ b/src/meson.build @@ -53,6 +53,9 @@ if with_gallium_or_lvp or with_gbm or with_platform_wayland subdir('loader') endif subdir('compiler') +if with_poly + subdir('poly') +endif if with_tools.contains('drm-shim') subdir('drm-shim') endif diff --git a/src/poly/.clang-format b/src/poly/.clang-format new file mode 100644 index 00000000000..6fc36ba4cca --- /dev/null +++ b/src/poly/.clang-format @@ -0,0 +1,8 @@ + +BasedOnStyle: InheritParentConfig +DisableFormat: false + +AlignConsecutiveBitFields: Consecutive +ColumnLimit: 80 +BreakStringLiterals: false +SpaceBeforeParens: ControlStatementsExceptControlMacros diff --git a/src/poly/cl/geometry.cl b/src/poly/cl/geometry.cl new file mode 100644 index 00000000000..b1ae4ba1620 --- /dev/null +++ b/src/poly/cl/geometry.cl @@ -0,0 +1,501 @@ +/* + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2023 Valve Corporation + * Copyright 2025 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl_vk.h"
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+#include "util/macros.h"
+#include "util/u_math.h"
+
+uint64_t nir_ro_to_rw_poly(uint64_t address);
+
+/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
+ * ID list with a consistent winding order.
+ *
+ * Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
+ * is its own inverse. It is hence used for both vertex fetch and transform
+ * feedback.
+ */
+static uint
+map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
+{
+   unsigned pv = flatshade_first ? 0 : 2;
+
+   bool even = (prim & 1) == 0;
+   bool provoking = vert == pv;
+
+   return (provoking || even) ? vert : ((3 - pv) - vert);
+}
+
+static inline uint
+xfb_prim(uint id, uint n, uint copy)
+{
+   return sub_sat(id, n - 1u) + copy;
+}
+
+/*
+ * Determine whether an output vertex has an n'th copy in the transform feedback
+ * buffer. This is written weirdly to let constant folding remove unnecessary
+ * stores when length is known statically.
+ */
+bool
+poly_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
+{
+   uint prim = xfb_prim(id, n, copy);
+
+   int num_prims = length - (n - 1);
+   return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
+}
+
+uint
+poly_xfb_vertex_offset(uint n, uint invocation_base_prim, uint strip_base_prim,
+                       uint id_in_strip, uint copy, bool flatshade_first)
+{
+   uint prim = xfb_prim(id_in_strip, n, copy);
+   uint vert_0 = min(id_in_strip, n - 1);
+   uint vert = vert_0 - copy;
+
+   if (n == 3) {
+      vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   /* Tally up in the whole buffer */
+   uint base_prim = invocation_base_prim + strip_base_prim;
+   uint base_vertex = base_prim * n;
+   return base_vertex + (prim * n) + vert;
+}
+
+uint64_t
+poly_xfb_vertex_address(constant struct poly_geometry_params *p, uint index,
+                        uint buffer, uint stride, uint output_offset)
+{
+   uint xfb_offset = (index * stride) + output_offset;
+
+   return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
+}
+
+static uint
+vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
+{
+   /* (0, 1), (1, 2), (2, 0) */
+   if (prim == (num_prims - 1) && vert == 1)
+      return 0;
+   else
+      return prim + vert;
+}
+
+uint
+poly_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
+                              uint num_prims)
+{
+   /* Line list, line strip, or line loop */
+   if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
+      return 0;
+
+   if (mode == MESA_PRIM_LINES)
+      prim *= 2;
+
+   return prim + vert;
+}
+
+static uint
+vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
+{
+   /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
+    * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
+    * Piglit clipflat expects us to switch between these orders depending on
+    * provoking vertex, to avoid trivializing the fan.
+    *
+    * Rotate accordingly.
+    */
+   if (flatshade_first) {
+      vert = (vert == 2) ? 0 : (vert + 1);
+   }
+
+   /* The simpler form assuming last is provoking. */
+   return (vert == 0) ? 0 : prim + vert;
+}
+
+uint
+poly_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
+                             bool flatshade_first)
+{
+   if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
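+      /* Rotate (0, 1, 2) to (1, 2, 0); combined with the vert == 0 check
+       * below, prim i then emits the Vulkan fan order (i + 1, i + 2, 0).
+       */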
+      vert = vert + 1;
+      vert = (vert == 3) ? 0 : vert;
+   }
+
+   if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
+      return 0;
+
+   if (mode == MESA_PRIM_TRIANGLES)
+      prim *= 3;
+
+   /* Triangle list, triangle strip, or triangle fan */
+   if (mode == MESA_PRIM_TRIANGLE_STRIP) {
+      unsigned pv = flatshade_first ? 0 : 2;
+
+      bool even = (prim & 1) == 0;
+      bool provoking = vert == pv;
+
+      vert = ((provoking || even) ? vert : ((3 - pv) - vert));
+   }
+
+   return prim + vert;
+}
+
+uint
+poly_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
+{
+   /* Line list adj or line strip adj */
+   if (mode == MESA_PRIM_LINES_ADJACENCY)
+      prim *= 4;
+
+   return prim + vert;
+}
+
+static uint
+vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
+                            bool flatshade_first)
+{
+   /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
+    *
+    * There are different cases for first/middle/last/only primitives and for
+    * odd/even primitives. Determine which case we're in.
+    */
+   bool last = prim == (num_prims - 1);
+   bool first = prim == 0;
+   bool even = (prim & 1) == 0;
+   bool even_or_first = even || first;
+
+   /* When the last vertex is provoking, we rotate the primitives
+    * accordingly. This seems required for OpenGL.
+    */
+   if (!flatshade_first && !even_or_first) {
+      vert = (vert + 4u) % 6u;
+   }
+
+   /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
+    * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
+    *
+    * Here we assume the first vertex is provoking, the Vulkan default.
+    */
+   uint offsets[6] = {
+      0,
+      first ? 1 : (even ? -2 : 3),
+      even_or_first ? 2 : 4,
+      last ? 5 : 6,
+      even_or_first ? 4 : 2,
+      even_or_first ? 3 : -2,
+   };
+
+   /* Ensure NIR can see thru the local array */
+   uint offset = 0;
+   for (uint i = 1; i < 6; ++i) {
+      if (i == vert)
+         offset = offsets[i];
+   }
+
+   /* Finally add to the base of the primitive */
+   return (prim * 2) + offset;
+}
+
+uint
+poly_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
+                                 uint nr, bool flatshade_first)
+{
+   /* Tri adj list or tri adj strip */
+   if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
+      return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
+   } else {
+      return (6 * prim) + vert;
+   }
+}
+
+static uint
+vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
+                       uint vert, uint num_prims)
+{
+   switch (mode) {
+   case MESA_PRIM_POINTS:
+   case MESA_PRIM_LINES:
+   case MESA_PRIM_TRIANGLES:
+   case MESA_PRIM_LINES_ADJACENCY:
+   case MESA_PRIM_TRIANGLES_ADJACENCY:
+      /* Regular primitive: every N vertices defines a primitive */
+      return (prim * mesa_vertices_per_prim(mode)) + vert;
+
+   case MESA_PRIM_LINE_LOOP:
+      return vertex_id_for_line_loop(prim, vert, num_prims);
+
+   case MESA_PRIM_LINE_STRIP:
+   case MESA_PRIM_LINE_STRIP_ADJACENCY:
+      /* (i, i + 1) or (i, ..., i + 3) */
+      return prim + vert;
+
+   case MESA_PRIM_TRIANGLE_STRIP: {
+      /* Order depends on the provoking vert.
+       *
+       * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
+       * Last:  (0, 1, 2), (2, 1, 3), (2, 3, 4).
+       *
+       * Pull the (maybe swapped) vert from the corresponding primitive
+       */
+      return prim + map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   case MESA_PRIM_TRIANGLE_FAN:
+      return vertex_id_for_tri_fan(prim, vert, flatshade_first);
+
+   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return vertex_id_for_tri_strip_adj(prim, vert, num_prims,
+                                         flatshade_first);
+
+   default:
+      return 0;
+   }
+}
+
+uint
+poly_map_to_line_adj(uint id)
+{
+   /* Sequence (1, 2), (5, 6), (9, 10), ...
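+    *
+    * (ids 0..5 map to 1, 2, 5, 6, 9, 10: each line keeps the two interior
+    * vertices of a 4-vertex lines-adjacency primitive.)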
+    */
+   return ((id & ~1) * 2) + (id & 1) + 1;
+}
+
+uint
+poly_map_to_line_strip_adj(uint id)
+{
+   /* Sequence (1, 2), (2, 3), (3, 4), .. */
+   uint prim = id / 2;
+   uint vert = id & 1;
+   return prim + vert + 1;
+}
+
+uint
+poly_map_to_tri_strip_adj(uint id)
+{
+   /* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
+    *
+    * Although tri strips with adjacency have 6 cases in general, after
+    * disregarding the vertices only available in a geometry shader, there are
+    * only even/odd cases. In other words, it's just a triangle strip subject to
+    * extra padding.
+    *
+    * Dividing through by two, the sequence is:
+    *
+    * (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
+    */
+   uint prim = id / 3;
+   uint vtx = id % 3;
+
+   /* Flip the winding order of odd triangles */
+   if ((prim % 2) == 1) {
+      if (vtx == 1)
+         vtx = 2;
+      else if (vtx == 2)
+         vtx = 1;
+   }
+
+   return 2 * (prim + vtx);
+}
+
+uint
+poly_load_index_buffer(constant struct poly_ia_state *p, uint id,
+                       uint index_size)
+{
+   return poly_load_index(p->index_buffer, p->index_buffer_range_el, id,
+                          index_size);
+}
+
+static uint
+setup_xfb_buffer(global struct poly_geometry_params *p, uint i, uint stride,
+                 uint max_output_end, uint vertices_per_prim)
+{
+   uint xfb_offset = *(p->xfb_offs_ptrs[i]);
+   p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
+
+   /* Let output_end = output_offset + output_size.
+    *
+    * Primitive P will write up to (but not including) offset:
+    *
+    *    xfb_offset + ((P - 1) * (verts_per_prim * stride))
+    *               + ((verts_per_prim - 1) * stride)
+    *               + output_end
+    *
+    * To fit all outputs for P, that value must be less than the XFB
+    * buffer size for the output with maximal output_end, as everything
+    * else is constant here across outputs within a buffer/primitive:
+    *
+    *    floor(P) <= (stride + size - xfb_offset - output_end)
+    *                   // (stride * verts_per_prim)
+    */
+   int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
+   uint numer = max(numer_s, 0);
+   return numer / (stride * vertices_per_prim);
+}
+
+void
+poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
+                 uint32_t prim_index_offset, uint32_t vertex_offset,
+                 uint32_t verts_in_prim, uint3 info)
+{
+   _poly_write_strip(index_buffer, inv_index_offset + prim_index_offset,
+                     vertex_offset, verts_in_prim, info.x, info.y, info.z);
+}
+
+void
+poly_pad_index_gs(global int *index_buffer, uint inv_index_offset,
+                  uint nr_indices, uint alloc)
+{
+   for (uint i = nr_indices; i < alloc; ++i) {
+      index_buffer[inv_index_offset + i] = -1;
+   }
+}
+
+uintptr_t
+poly_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
+                           gl_varying_slot location)
+{
+   /* Written like this to let address arithmetic work */
+   return buffer + ((uintptr_t)poly_tcs_in_offs_el(vtx, location, mask)) * 16;
+}
+
+uintptr_t
+poly_geometry_input_address(constant struct poly_geometry_params *p, uint vtx,
+                            gl_varying_slot location)
+{
+   return poly_vertex_output_address(p->input_buffer, p->input_mask, vtx,
+                                     location);
+}
+
+unsigned
+poly_input_vertices(constant struct poly_ia_state *ia)
+{
+   return ia->verts_per_instance;
+}
+
+global uint *
+poly_load_xfb_count_address(constant struct poly_geometry_params *p, int index,
+                            int count_words, uint unrolled_id)
+{
+   return &p->count_buffer[(unrolled_id * count_words) + index];
+}
+
+uint
+poly_previous_xfb_primitives(global struct poly_geometry_params *p,
+                             int static_count, int count_index, int count_words,
+                             bool prefix_sum, uint unrolled_id)
+{
+   if (static_count >= 0) {
+      /* If the number of outputted vertices per invocation is known
+       * statically, we can calculate the base.
+       */
+      return unrolled_id * static_count;
+   } else {
+      /* Otherwise, load from the count buffer. Note that the sums are
+       * inclusive, so index 0 is nonzero. This requires a little fixup here. We
+       * use a saturating unsigned subtraction so we don't read out-of-bounds.
+       *
+       * If we didn't prefix sum, there's only one element.
+       */
+      uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
+      uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
+
+      return unrolled_id == 0 ? 0 : count;
+   }
+}
+
+/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
+#define poly_foreach_xfb(word, index)                                          \
+   for (uint index = 0; index < 4; ++index)                                    \
+      if (word & BITFIELD_BIT(index))
+
+void
+poly_pre_gs(global struct poly_geometry_params *p, uint streams,
+            uint buffers_written, uint4 buffer_to_stream, int4 count_index,
+            uint4 stride, uint4 output_end, int4 static_count, uint invocations,
+            uint vertices_per_prim, global uint *gs_invocations,
+            global uint *gs_primitives, global uint *c_primitives,
+            global uint *c_invocations)
+{
+   unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
+                          !!(count_index[2] >= 0) + !!(count_index[3] >= 0);
+   bool prefix_sum = count_words && buffers_written;
+   uint unrolled_in_prims = p->input_primitives;
+
+   /* Determine the number of primitives generated in each stream */
+   uint4 in_prims = 0;
+   poly_foreach_xfb(streams, i) {
+      in_prims[i] = poly_previous_xfb_primitives(p, static_count[i],
+                                                 count_index[i], count_words,
+                                                 prefix_sum, unrolled_in_prims);
+
+      *(p->prims_generated_counter[i]) += in_prims[i];
+   }
+
+   uint4 prims = in_prims;
+   uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
+
+   if (buffers_written) {
+      poly_foreach_xfb(buffers_written, i) {
+         uint max_prims =
+            setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
+
+         unsigned stream = buffer_to_stream[i];
+         prims[stream] = min(prims[stream], max_prims);
+      }
+
+      int4 overflow = prims < in_prims;
+
+      poly_foreach_xfb(streams, i) {
+         p->xfb_verts[i] = prims[i] * vertices_per_prim;
+
+         *(p->xfb_overflow[i]) += (bool)overflow[i];
+         *(p->xfb_prims_generated_counter[i]) += prims[i];
+      }
+
+      *(p->xfb_any_overflow) += any(overflow);
+
+      /* Update XFB counters */
+      poly_foreach_xfb(buffers_written, i) {
+         uint32_t prim_stride_B = stride[i] * vertices_per_prim;
+         unsigned stream = buffer_to_stream[i];
+
+         global uint *ptr = p->xfb_offs_ptrs[i];
+
+         ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
+         *ptr += prims[stream] * prim_stride_B;
+      }
+   }
+
+   /* The geometry shader is invoked once per primitive (after unrolling
+    * primitive restart). From the spec:
+    *
+    *    In case of instanced geometry shaders (see section 11.3.4.2) the
+    *    geometry shader invocations count is incremented for each separate
+    *    instanced invocation.
+    */
+   *gs_invocations += unrolled_in_prims * invocations;
+   *gs_primitives += emitted_prims;
+
+   /* Clipper queries are not well-defined, so we can emulate them in lots of
+    * silly ways. We need the hardware counters to implement them properly. For
+    * now, just consider all primitives emitted as passing through the clipper.
+    * This satisfies spec text:
+    *
+    *    The number of primitives that reach the primitive clipping stage.
+    *
+    * and
+    *
+    *    If at least one vertex of the primitive lies inside the clipping
+    *    volume, the counter is incremented by one or more. Otherwise, the
+    *    counter is incremented by zero or more.
+    */
+   *c_primitives += emitted_prims;
+   *c_invocations += emitted_prims;
+}
diff --git a/src/poly/cl/meson.build b/src/poly/cl/meson.build
new file mode 100644
index 00000000000..286dcf1c90e
--- /dev/null
+++ b/src/poly/cl/meson.build
@@ -0,0 +1,35 @@
+# Copyright 2024 Valve Corporation
+# Copyright © 2025 Collabora Ltd.
+# SPDX-License-Identifier: MIT
+
+libpoly_shader_files = files(
+  'geometry.cl',
+  'tessellation.cl',
+)
+
+libpoly_shaders_spv = custom_target(
+  input : libpoly_shader_files,
+  output : 'libpoly.spv',
+  command : [
+    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
+    libpoly_shader_files, '--',
+    '-I' + join_paths(meson.project_source_root(), 'include'),
+    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
+    '-I' + join_paths(meson.current_source_dir(), '.'),
+    '-I' + join_paths(meson.current_source_dir(), '../../'),
+    cl_args,
+  ],
+  depends : [],
+  depfile : 'libpoly_shaders.h.d',
+)
+
+libpoly_shaders = custom_target(
+  input : libpoly_shaders_spv,
+  output : ['libpoly.cpp', 'libpoly.h'],
+  command : [prog_vtn_bindgen2, libpoly_shaders_spv, '@OUTPUT0@', '@OUTPUT1@'],
)
+
+idep_libpoly = declare_dependency(
+  sources : [libpoly_shaders],
+  include_directories : include_directories('.'),
+)
diff --git a/src/poly/cl/tessellation.cl b/src/poly/cl/tessellation.cl
new file mode 100644
index 00000000000..ed37ca889ba
--- /dev/null
+++ b/src/poly/cl/tessellation.cl
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+
+uint
+poly_tcs_patch_vertices_in(constant struct poly_tess_args *p)
+{
+   return p->input_patch_size;
+}
+
+uint
+poly_tes_patch_vertices_in(constant struct poly_tess_args *p)
+{
+   return p->output_patch_size;
+}
+
+uint
+poly_tcs_unrolled_id(constant struct poly_tess_args *p, uint3 wg_id)
+{
+   return (wg_id.y * p->patches_per_instance) + wg_id.x;
+}
+
+uint64_t
+poly_tes_buffer(constant struct poly_tess_args *p)
+{
+   return p->tes_buffer;
+}
+
+/*
+ * Helper to lower indexing for a tess eval shader run as a compute shader. This
+ * handles the tess+geom case. This is simpler than the general input assembly
+ * lowering, as we know:
+ *
+ * 1. the index buffer is U32
+ * 2. the index is in bounds
+ *
+ * Therefore we do a simple load. No bounds checking needed.
+ */
+uint32_t
+poly_load_tes_index(constant struct poly_tess_args *p, uint32_t index)
+{
+   /* Swap second and third vertices of each triangle to flip winding order
+    * dynamically if needed.
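+    *
+    * (Illustrative: with ccw set, indices 0..5 fetch index buffer elements
+    * 0, 2, 1, 3, 5, 4, swapping the last two vertices of each triangle.)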
+ */ + if (p->ccw) { + uint id = index % 3; + + if (id == 1) + index++; + else if (id == 2) + index--; + } + + return p->index_buffer[index]; +} + +uintptr_t +poly_tcs_out_address(constant struct poly_tess_args *p, uint patch_id, + uint vtx_id, gl_varying_slot location, uint nr_patch_out, + uint out_patch_size, uint64_t vtx_out_mask) +{ + uint stride_el = + poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask); + + uint offs_el = + poly_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask); + + offs_el += patch_id * stride_el; + + /* Written to match the AGX addressing mode */ + return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2); +} + +static uint +tes_unrolled_patch_id(uint raw_id) +{ + return raw_id / POLY_TES_PATCH_ID_STRIDE; +} + +uint +poly_tes_patch_id(constant struct poly_tess_args *p, uint raw_id) +{ + return tes_unrolled_patch_id(raw_id) % p->patches_per_instance; +} + +static uint +tes_vertex_id_in_patch(uint raw_id) +{ + return raw_id % POLY_TES_PATCH_ID_STRIDE; +} + +float2 +poly_load_tess_coord(constant struct poly_tess_args *p, uint raw_id) +{ + uint patch = tes_unrolled_patch_id(raw_id); + uint vtx = tes_vertex_id_in_patch(raw_id); + + global struct poly_tess_point *t = + &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; + + /* Written weirdly because NIR struggles with loads of structs */ + uint2 fixed = *((global uint2 *)t); + + /* Convert fixed point to float */ + return convert_float2(fixed) / (1u << 16); +} + +uintptr_t +poly_tes_in_address(constant struct poly_tess_args *p, uint raw_id, uint vtx_id, + gl_varying_slot location) +{ + uint patch = tes_unrolled_patch_id(raw_id); + + return poly_tcs_out_address(p, patch, vtx_id, location, + p->tcs_patch_constants, p->output_patch_size, + p->tcs_per_vertex_outputs); +} + +float4 +poly_tess_level_outer_default(constant struct poly_tess_args *p) +{ + return vload4(0, p->tess_level_outer_default); +} + +float2 +poly_tess_level_inner_default(constant struct poly_tess_args *p) +{ + return vload2(0, p->tess_level_inner_default); +} diff --git a/src/poly/cl/tessellator.h b/src/poly/cl/tessellator.h new file mode 100644 index 00000000000..0dbe5b76d52 --- /dev/null +++ b/src/poly/cl/tessellator.h @@ -0,0 +1,1609 @@ +/* + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef __OPENCL_VERSION__ +#error "Tessellator should only be imported by OpenCL C code" +#endif + +#include "poly/geometry.h" +#include "poly/tessellator.h" +#include "util/u_math.h" + +#pragma once + +#define POLY_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f +#define POLY_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f + +typedef unsigned int FXP; // fixed point number + +enum { + U = 0, // points on a tri patch + V = 1, +}; + +enum { + Ueq0 = 0, // edges on a tri patch + Veq0 = 1, + Weq0 = 2, +}; + +enum { + Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + Veq1 = 3, +}; + +#define QUAD_AXES 2 +#define QUAD_EDGES 4 +#define TRI_EDGES 3 + +// The interior can just use a simpler stitch. +typedef enum DIAGONALS { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED +} DIAGONALS; + +typedef struct TESS_FACTOR_CONTEXT { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; +} TESS_FACTOR_CONTEXT; + +struct INDEX_PATCH_CONTEXT { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; +}; + +struct INDEX_PATCH_CONTEXT2 { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; +}; + +struct CHWTessellator { + enum poly_tess_mode mode; + uint index_bias; + + // array where we will store u/v's for the points we generate + global struct poly_tess_point *Point; + + // array where we will store index topology + global void *Index; + + // A second index patch we have to do handles the leftover strip of quads in + // the middle of an odd quad patch after finishing all the concentric rings. + // This also handles the leftover strip of points in the middle of an even + // quad patch, when stitching the row of triangles up the left side (V major + // quad) or bottom (U major quad) of the inner ring + bool bUsingPatchedIndices; + bool bUsingPatchedIndices2; + struct INDEX_PATCH_CONTEXT IndexPatchCtx; + struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; +}; + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_ONE (1 << FXP_FRACTION_BITS) +#define FXP_ONE_THIRD 0x00005555 +#define FXP_TWO_THIRDS 0x0000aaaa +#define FXP_ONE_HALF 0x00008000 + +static inline global float * +tess_factors(constant struct poly_tess_args *p, uint patch) +{ + return p->tcs_buffer + (patch * p->tcs_stride_el); +} + +/* + * Generate an indexed draw for a patch with the computed number of indices. + * This allocates heap memory for the index buffer, returning the allocated + * memory. + */ +static inline global void * +poly_draw(constant struct poly_tess_args *p, enum poly_tess_mode mode, + bool lines, uint patch, uint count) +{ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = count; + } + + if (mode == POLY_TESS_MODE_WITH_COUNTS) { + /* The index buffer is already allocated, get a pointer inside it. + * p->counts has had an inclusive prefix sum hence the subtraction. 
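+       *
+       * (Illustrative: raw per-patch counts (6, 0, 3) become (6, 6, 9) after
+       * the inclusive sum, so patch 0 starts at element 0, patch 1 at
+       * counts[0] = 6, and patch 2 at counts[1] = 6.)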
+ */ + uint offset_el = p->counts[sub_sat(patch, 1u)]; + if (patch == 0) + offset_el = 0; + + return &p->index_buffer[offset_el]; + } + + return NULL; +} + +static inline void +poly_draw_points(private struct CHWTessellator *ctx, + constant struct poly_tess_args *p, uint patch, uint count) +{ + /* For points mode with a single draw, we need to generate a trivial index + * buffer to stuff in the patch ID in the right place. + */ + global uint32_t *indices = poly_draw(p, ctx->mode, false, patch, count); + + if (ctx->mode == POLY_TESS_MODE_COUNT) + return; + + for (int i = 0; i < count; ++i) { + indices[i] = ctx->index_bias + i; + } +} + +static inline void +poly_draw_empty(constant struct poly_tess_args *p, enum poly_tess_mode mode, + uint patch) +{ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = 0; + } +} + +/* + * Allocate heap memory for domain points for a patch. The allocation + * is recorded in the coord_allocs[] array, which is in elements. + */ +static inline global struct poly_tess_point * +poly_heap_alloc_points(constant struct poly_tess_args *p, uint patch, + uint count) +{ + /* If we're recording statistics, increment now. The statistic is for + * tessellation evaluation shader invocations, which is equal to the number + * of domain points generated. + */ + if (p->statistic) { + atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); + } + + uint32_t elsize_B = sizeof(struct poly_tess_point); + uint32_t alloc_B = poly_heap_alloc_atomic_offs(p->heap, elsize_B * count); + uint32_t alloc_el = alloc_B / elsize_B; + + p->coord_allocs[patch] = alloc_el; + return (global struct poly_tess_point *)(((uintptr_t)p->heap->base) + + alloc_B); +} + +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com + +#define POLY_TESS_MIN_ODD_TESSELLATION_FACTOR 1 +#define POLY_TESS_MAX_ODD_TESSELLATION_FACTOR 63 +#define POLY_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 +#define POLY_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 + +// 2^(-16), min positive fixed point fraction +#define EPSILON 0.0000152587890625f +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ + (POLY_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) + +static inline float +clamp_factor(float factor, enum poly_tess_partitioning partitioning, float maxf) +{ + float lower = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) + ? POLY_TESS_MIN_EVEN_TESSELLATION_FACTOR + : POLY_TESS_MIN_ODD_TESSELLATION_FACTOR; + + float upper = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD) + ? POLY_TESS_MAX_ODD_TESSELLATION_FACTOR + : POLY_TESS_MAX_EVEN_TESSELLATION_FACTOR; + + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. 
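+   // (Illustrative: with fractional_odd partitioning and maxf = 1.5, an
+   // inside factor of 1.0 is clamped to 1.0 + EPSILON, which still converts
+   // to a fixed-point value > 1, so an interior ring is generated.)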
+   if (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD &&
+       maxf > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) {
+
+      lower = POLY_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON;
+   }
+
+   factor = clamp(factor, lower, upper);
+
+   if (partitioning == POLY_TESS_PARTITIONING_INTEGER) {
+      factor = ceil(factor);
+   }
+
+   return factor;
+}
+
+static inline FXP
+floatToFixed(const float input)
+{
+   return mad(input, FXP_ONE, 0.5f);
+}
+
+static inline bool
+isOdd(const float input)
+{
+   return ((int)input) & 1;
+}
+
+static inline FXP
+fxpCeil(const FXP input)
+{
+   if (input & FXP_FRACTION_MASK) {
+      return (input & FXP_INTEGER_MASK) + FXP_ONE;
+   }
+   return input;
+}
+
+static inline FXP
+fxpFloor(const FXP input)
+{
+   return (input & FXP_INTEGER_MASK);
+}
+
+static inline int
+PatchIndexValue(private struct CHWTessellator *ctx, int index)
+{
+   if (ctx->bUsingPatchedIndices) {
+      // assumed remapped outside indices are > remapped inside vertices
+      if (index >= ctx->IndexPatchCtx.outsidePointIndexPatchBase) {
+         if (index == ctx->IndexPatchCtx.outsidePointIndexBadValue)
+            return ctx->IndexPatchCtx.outsidePointIndexReplacementValue;
+         else
+            return index + ctx->IndexPatchCtx.outsidePointIndexDeltaToRealValue;
+      } else {
+         if (index == ctx->IndexPatchCtx.insidePointIndexBadValue)
+            return ctx->IndexPatchCtx.insidePointIndexReplacementValue;
+         else
+            return index + ctx->IndexPatchCtx.insidePointIndexDeltaToRealValue;
+      }
+   } else if (ctx->bUsingPatchedIndices2) {
+      if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) {
+         return ctx->IndexPatchCtx2.cornerCaseReplacementValue;
+      } else if (index >= ctx->IndexPatchCtx2.baseIndexToInvert) {
+         return ctx->IndexPatchCtx2.indexInversionEndPoint - index;
+      }
+   }
+
+   return index;
+}
+
+static inline void
+DefinePoint(global struct poly_tess_point *out, FXP fxpU, FXP fxpV)
+{
+   out->u = fxpU;
+   out->v = fxpV;
+}
+
+static inline void
+DefineIndex(private struct CHWTessellator *ctx, int index,
+            int indexStorageOffset)
+{
+   global uint32_t *indices = (global uint32_t *)ctx->Index;
+   indices[indexStorageOffset] = ctx->index_bias + PatchIndexValue(ctx, index);
+}
+
+static inline void
+DefineTriangle(private struct CHWTessellator *ctx, int index0, int index1,
+               int index2, int indexStorageBaseOffset)
+{
+   index0 = PatchIndexValue(ctx, index0);
+   index1 = PatchIndexValue(ctx, index1);
+   index2 = PatchIndexValue(ctx, index2);
+
+   vstore3(ctx->index_bias + (uint3)(index0, index1, index2), 0,
+           (global uint *)ctx->Index + indexStorageBaseOffset);
+}
+
+static inline uint32_t
+RemoveMSB(uint32_t val)
+{
+   uint32_t bit = val ? (1 << (31 - clz(val))) : 0;
+   return val & ~bit;
+}
+
+static inline int
+NumPointsForTessFactor(bool odd, FXP fxpTessFactor)
+{
+   // Add epsilon for rounding and add 1 for odd
+   FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1);
+   int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1);
+   return odd ? r : r + 1;
+}
+
+static inline void
+ComputeTessFactorCtx(bool odd, FXP fxpTessFactor,
+                     private TESS_FACTOR_CONTEXT *TessFactorCtx)
+{
+   // fxpHalfTessFactor == 1/2 if TessFactor is 1,
+   // but we're pretending we are even.
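+   // (Worked example, for intuition: an even TessFactor of 4.0 gives
+   // fxpHalfTessFactor == 2.0 exactly, so the fraction is zero,
+   // numHalfTessFactorPoints == 2, and both halves resolve to 4 segments;
+   // PlacePointIn1D() then yields the uniform positions 0, 1/4, 1/2, 3/4, 1.)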
+ FXP fxpHalfTessFactor = (fxpTessFactor + 1 /*round*/) / 2; + if (odd || (fxpHalfTessFactor == FXP_ONE_HALF)) { + fxpHalfTessFactor += FXP_ONE_HALF; + } + FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); + FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); + TessFactorCtx->fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; + TessFactorCtx->numHalfTessFactorPoints = + (fxpCeilHalfTessFactor >> FXP_FRACTION_BITS); // for EVEN, we don't include the point always + // fixed at the midpoint of the TessFactor + if (fxpCeilHalfTessFactor == fxpFloorHalfTessFactor) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + /*pick value to cause this to be ignored*/ TessFactorCtx->numHalfTessFactorPoints + 1; + } else if (odd) { + if (fxpFloorHalfTessFactor == FXP_ONE) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = 0; + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB((fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) - 1) << 1) + 1; + } + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB(fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) << 1) + 1; + } + int numFloorSegments = (fxpFloorHalfTessFactor * 2) >> FXP_FRACTION_BITS; + int numCeilSegments = (fxpCeilHalfTessFactor * 2) >> FXP_FRACTION_BITS; + if (odd) { + numFloorSegments -= 1; + numCeilSegments -= 1; + } + TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor = + floatToFixed(1.0f / (float)numFloorSegments); + TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor = + floatToFixed(1.0f / (float)numCeilSegments); +} + +static inline FXP +PlacePointIn1D(private const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, + int point) +{ + bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints; + + if (bFlip) { + point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd; + } + + // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly + if (point == TessFactorCtx->numHalfTessFactorPoints) + return FXP_ONE_HALF; + + unsigned int indexOnCeilHalfTessFactor = point; + unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; + if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) { + indexOnFloorHalfTessFactor -= 1; + } + // For the fixed point multiplies below, we know the results are <= 16 bits + // because the locations on the halfTessFactor are <= half the number of + // segments for the total TessFactor. So a number divided by a number that + // is at least twice as big will give a result no bigger than 0.5 (which in + // fixed point is 16 bits in our case) + FXP fxpLocationOnFloorHalfTessFactor = + indexOnFloorHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpLocationOnCeilHalfTessFactor = + indexOnCeilHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor; + + // Since we know the numbers calculated above are <= fixed point 0.5, and the + // equation below is just lerping between two values <= fixed point 0.5 + // (0x00008000), then we know that the final result before shifting by 16 bits + // is no larger than 0x80000000. 
Once we shift that down by 16, we get the + // result of lerping 2 numbers <= 0.5, which is obviously at most 0.5 + // (0x00008000) + FXP fxpLocation = + fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) + + fxpLocationOnCeilHalfTessFactor * (TessFactorCtx->fxpHalfTessFactorFraction); + fxpLocation = (fxpLocation + FXP_ONE_HALF /*round*/) >> FXP_FRACTION_BITS; // get back to n.16 + if (bFlip) { + fxpLocation = FXP_ONE - fxpLocation; + } + return fxpLocation; +} + +static inline void +StitchRegular(private struct CHWTessellator *ctx, bool bTrapezoid, + DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, + int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) +{ + int insidePoint = insideEdgePointBaseOffset; + int outsidePoint = outsideEdgePointBaseOffset; + if (bTrapezoid) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + int p; + switch (diagonals) { + case DIAGONALS_INSIDE_TO_OUTSIDE: + // Diagonals pointing from inside edge forward towards outside edge + for (p = 0; p < numInsideEdgePoints - 1; p++) { + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation + // Diagonals pointing from outside edge forward towards inside edge + + // First half + for (p = 0; p < numInsideEdgePoints / 2 - 1; p++) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + + // Middle + DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + p += 2; + + // Second half + for (; p < numInsideEdgePoints; p++) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of + // inside edge + for (p = 0; p < numInsideEdgePoints / 2; p++) { + DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of + // outside edge + for (; p < numInsideEdgePoints - 1; p++) { + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + } + if (bTrapezoid) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + } +} + +// loop_start 
and loop_end give optimal loop bounds for
+// the stitching algorithm further below, for any given halfTessFactor. There
+// is probably a better way to encode this...
+//
+// Return the FIRST entry in finalPointPositionTable which is less than
+// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
+static inline int
+loop_start(int N)
+{
+   if (N < 2)
+      return 1;
+   else if (N == 2)
+      return 17;
+   else if (N < 5)
+      return 9;
+   else if (N < 9)
+      return 5;
+   else if (N < 17)
+      return 3;
+   else
+      return 2;
+}
+
+// Return the LAST entry in finalPointPositionTable[] which is less than
+// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
+static int
+loop_end(int N)
+{
+   if (N < 2)
+      return 0;
+   else if (N < 4)
+      return 17;
+   else if (N < 8)
+      return 25;
+   else if (N < 16)
+      return 29;
+   else if (N < 32)
+      return 31;
+   else
+      return 32;
+}
+
+// Tables to assist in the stitching of 2 rows of points having arbitrary
+// TessFactors. The stitching order is governed by Ruler Function vertex
+// split ordering (see external documentation).
+//
+// The contents of the finalPointPositionTable are where vertex i [0..33]
+// ends up on the half-edge at the max tessellation amount given
+// ruler-function split order. Recall the other half of an edge is mirrored,
+// so we only need to deal with one half. This table is used to decide when
+// to advance a point on the interior or exterior. It supports odd TessFactor
+// up to 65 and even TessFactor up to 64.
+
+/* TODO: Is this actually faster than a LUT? */
+static inline uint32_t
+finalPointPositionTable(uint32_t x)
+{
+   if (x == 0)
+      return 0;
+   if (x == 1)
+      return 0x20;
+
+   uint32_t shift;
+   if ((x & 1) == 0) {
+      shift = 1;
+   } else if ((x & 3) == 3) {
+      shift = 2;
+   } else if ((x & 7) == 5) {
+      shift = 3;
+   } else if (x != 17) {
+      shift = 4;
+   } else {
+      shift = 5;
+   }
+
+   // SWAR vectorized right-shift of (0x20, x)
+   // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`.
+   uint32_t items_to_shift = x | (0x20 << 16);
+   uint32_t shifted = items_to_shift >> shift;
+
+   uint32_t bias = min(0xfu, shifted >> 16);
+   return bias + (shifted & 0xffff);
+}
+
+static inline void
+StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset,
+                 int insideEdgePointBaseOffset,
+                 int insideNumHalfTessFactorPoints,
+                 bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset,
+                 int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd)
+{
+   if (insideEdgeTessFactorOdd) {
+      insideNumHalfTessFactorPoints -= 1;
+   }
+   if (outsideTessFactorOdd) {
+      outsideNumHalfTessFactorPoints -= 1;
+   }
+   // Walk first half
+   int outsidePoint = outsideEdgePointBaseOffset;
+   int insidePoint = insideEdgePointBaseOffset;
+
+   // iStart,iEnd are a small optimization so the loop below doesn't have to go
+   // from 0 up to 31
+   int iStart = min(loop_start(insideNumHalfTessFactorPoints),
+                    loop_start(outsideNumHalfTessFactorPoints));
+   int iEnd = loop_end(
+      max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints));
+
+   // since we don't start the loop at 0 below, we need a special case.
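+   // (finalPointPositionTable maps i == 0 to 0, which would advance on every
+   // iteration, and i == 1 to 32, which never can for the supported factors;
+   // hence iStart >= 1 and the explicit first outside-edge advance here.)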
+ if (0 < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + + for (int i = iStart; i <= iEnd; i++) { + int bound = finalPointPositionTable(i); + + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + } + + if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) || + insideEdgeTessFactorOdd) { + if (insideEdgeTessFactorOdd == outsideTessFactorOdd) { + // Quad in the middle + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint + 1, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } else if (!insideEdgeTessFactorOdd) { + // Triangle pointing inside + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } else { + // Triangle pointing outside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + + // Walk second half. + for (int i = iEnd; i >= iStart; i--) { + int bound = finalPointPositionTable(i); + + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + // Below case is not needed if we didn't optimize loop above and made it run + // from 31 down to 0. + if (0 < outsideNumHalfTessFactorPoints) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } +} + +static inline void +poly_tess_isoline_process(constant struct poly_tess_args *p, uint32_t patch, + enum poly_tess_mode mode) +{ + enum poly_tess_partitioning partitioning = p->partitioning; + + bool lineDensityOdd; + bool lineDetailOdd; + TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; + TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; + + global float *factors = tess_factors(p, patch); + float TessFactor_V_LineDensity = factors[0]; + float TessFactor_U_LineDetail = factors[1]; + + // Is the patch culled? NaN will pass. 
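+   // (!(x > 0) is also true when x is NaN, so a NaN tess factor culls the
+   // patch rather than drawing with a garbage factor.)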
+ if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) { + poly_draw_empty(p, mode, patch); + return; + } + + // Clamp edge TessFactors + TessFactor_V_LineDensity = + clamp(TessFactor_V_LineDensity, + POLY_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, + POLY_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR); + TessFactor_U_LineDetail = + clamp_factor(TessFactor_U_LineDetail, partitioning, 0); + + // Process tessFactors + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + lineDetailOdd = isOdd(TessFactor_U_LineDetail); + } else { + lineDetailOdd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + } + + FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); + + ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail, + &lineDetailTessFactorCtx); + int numPointsPerLine = + NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail); + + TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); + lineDensityOdd = isOdd(TessFactor_V_LineDensity); + FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); + ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity, + &lineDensityTessFactorCtx); + + // don't draw last line at V == 1. + int numLines = + NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1; + + /* Points */ + uint num_points = numPointsPerLine * numLines; + if (mode != POLY_TESS_MODE_COUNT) { + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, num_points); + + for (int line = 0, pointOffset = 0; line < numLines; line++) { + FXP fxpV = + PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line); + + for (int point = 0; point < numPointsPerLine; point++) { + FXP fxpU = + PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point); + + DefinePoint(&points[pointOffset++], fxpU, fxpV); + } + } + } + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + /* Connectivity */ + if (!p->points_mode) { + uint num_indices = numLines * (numPointsPerLine - 1) * 2; + ctx.Index = poly_draw(p, mode, true, patch, num_indices); + + if (mode == POLY_TESS_MODE_COUNT) + return; + + for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines; + line++) { + pointOffset++; + + for (int point = 1; point < numPointsPerLine; point++) { + DefineIndex(&ctx, pointOffset - 1, indexOffset++); + DefineIndex(&ctx, pointOffset, indexOffset++); + pointOffset++; + } + } + } else { + poly_draw_points(&ctx, p, patch, num_points); + } +} + +static inline void +poly_tess_tri_process(constant struct poly_tess_args *p, uint32_t patch, + enum poly_tess_mode mode) +{ + enum poly_tess_partitioning partitioning = p->partitioning; + + global float *factors = tess_factors(p, patch); + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Weq0 = factors[2]; + float insideTessFactor_f = factors[4]; + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + // Is the patch culled? NaN will pass. 
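+   // Only the three outer factors participate in culling; the inside factor
+   // (factors[4]; factors[3] is unused for triangles) is instead clamped
+   // below, where NaN maps to the lower bound.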
+ if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) || + !(tessFactor_Weq0 > 0)) { + + poly_draw_empty(p, mode, patch); + + return; + } + + FXP outsideTessFactor[TRI_EDGES]; + FXP insideTessFactor; + bool outsideTessFactorOdd[TRI_EDGES]; + bool insideTessFactorOdd; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[TRI_EDGES]; + int numPointsForInsideTessFactor; + int insideEdgePointBaseOffset; + + // Clamp TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0); + + float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0); + insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf); + // Note the above clamps map NaN to the lower bound + + // Process tessFactors + float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Weq0}; + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + insideTessFactorOdd = + isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f); + } else { + bool odd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd = odd; + } + + // Save fixed point TessFactors + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + insideTessFactor = floatToFixed(insideTessFactor_f); + + if (partitioning != POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Weq0])) { + + /* Just do minimum tess factor */ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = 3; + return; + } + + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, 3); + + DefinePoint(&points[0], 0, + FXP_ONE); // V=1 (beginning of Ueq0 edge VW) + DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU) + DefinePoint(&points[2], FXP_ONE, + 0); // U=1 (beginning of Weq0 edge UV) + + if (!p->points_mode) { + ctx.Index = poly_draw(p, mode, false, patch, 3); + + DefineTriangle(&ctx, 0, 1, 2, + /*indexStorageBaseOffset*/ 0); + } else { + poly_draw_points(&ctx, p, patch, 3); + } + + return; + } + } + + // Compute per-TessFactor metadata + for (int edge = 0; edge < TRI_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor, + &insideTessFactorCtx); + + // Compute some initial data. + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < TRI_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 3; + + // inside edge offsets + numPointsForInsideTessFactor = + NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor); + { + int pointCountMin = insideTessFactorOdd ? 
4 : 3;
+      // max() allows degenerate transition regions when inside TessFactor == 1
+      numPointsForInsideTessFactor =
+         max(pointCountMin, numPointsForInsideTessFactor);
+   }
+
+   insideEdgePointBaseOffset = NumPoints;
+
+   // inside storage, including interior edges above
+   {
+      int interiorRings = (numPointsForInsideTessFactor >> 1) - 1;
+      int even = insideTessFactorOdd ? 0 : 1;
+      NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even;
+   }
+
+   /* GENERATE POINTS */
+   if (mode != POLY_TESS_MODE_COUNT) {
+      ctx.Point = poly_heap_alloc_points(p, patch, NumPoints);
+
+      // Generate exterior ring edge points, clockwise starting from point V
+      // (VW, the U==0 edge)
+      int pointOffset = 0;
+      for (int edge = 0; edge < TRI_EDGES; edge++) {
+         int odd = edge & 0x1;
+         int endPoint = numPointsForOutsideEdge[edge] - 1;
+         // don't include end, since next edge starts with it.
+         for (int p = 0; p < endPoint; p++, pointOffset++) {
+            // whether to reverse point order given we are defining V or U (W
+            // implicit): edge0, VW, has V decreasing, so reverse 1D points
+            // below; edge1, WU, has U increasing, so don't reverse 1D points
+            // below; edge2, UV, has U decreasing, so reverse 1D points below
+            int q = odd ? p : endPoint - p;
+
+            FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge],
+                                          outsideTessFactorOdd[edge], q);
+            DefinePoint(&ctx.Point[pointOffset], (edge == 0) ? 0 : fxpParam,
+                        (edge == 0)   ? fxpParam
+                        : (edge == 2) ? FXP_ONE - fxpParam
+                                      : 0);
+         }
+      }
+
+      // Generate interior ring points, clockwise spiralling in
+      int numRings = (numPointsForInsideTessFactor >> 1);
+      for (int ring = 1; ring < numRings; ring++) {
+         int startPoint = ring;
+         int endPoint = numPointsForInsideTessFactor - 1 - startPoint;
+
+         int perpendicularAxisPoint = startPoint;
+         FXP fxpPerpParam = PlacePointIn1D(
+            &insideTessFactorCtx, insideTessFactorOdd, perpendicularAxisPoint);
+
+         // Map location to the right size in
+         // barycentric space. We know this fixed
+         // point math won't over/underflow
+         fxpPerpParam *= FXP_TWO_THIRDS;
+         fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >>
+                        FXP_FRACTION_BITS; // get back to n.16
+
+         for (int edge = 0; edge < TRI_EDGES; edge++) {
+            int odd = edge & 0x1;
+
+            // don't include end: next edge starts with it.
+            for (int p = startPoint; p < endPoint; p++, pointOffset++) {
+               // whether to reverse point given we are defining V or U (W
+               // implicit): edge0, VW, has V decreasing, so reverse 1D points
+               // below; edge1, WU, has U increasing, so don't reverse 1D
+               // points below; edge2, UV, has U decreasing, so reverse 1D
+               // points below
+               int q = odd ? p : endPoint - (p - startPoint);
+
+               FXP fxpParam =
+                  PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q);
+               // edge0 VW, has perpendicular parameter U constant
+               // edge1 WU, has perpendicular parameter V constant
+               // edge2 UV, has perpendicular parameter W constant
+               // reciprocal is the rate of change of edge-parallel parameters
+               // as they are pushed into the triangle
+               const unsigned int deriv = 2;
+
+               // we know this fixed point math won't over/underflow
+               FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv;
+
+               DefinePoint(&ctx.Point[pointOffset],
+                           edge > 0 ? tmp : fxpPerpParam,
+                           edge == 0   ? tmp
+                           : edge == 1 ? fxpPerpParam
+                                       : FXP_ONE - tmp - fxpPerpParam);
+            }
+         }
+      }
+      if (!insideTessFactorOdd) {
+         // Last point is the point at the center.
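+         // (u, v) = (1/3, 1/3) with w = 1 - u - v = 1/3 implicit: for even
+         // inside factors the innermost "ring" collapses to the barycentric
+         // center rather than a ring of points.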
+         DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD);
+      }
+   }
+
+   if (p->points_mode) {
+      poly_draw_points(&ctx, p, patch, NumPoints);
+      return;
+   }
+
+   {
+      // Generate primitives for all the concentric rings, one side at a time
+      // for each ring. The +1 is so even tess includes the center point, which
+      // we want now
+      int numRings = ((numPointsForInsideTessFactor + 1) >> 1);
+
+      int NumIndices = 0;
+      {
+         int OuterPoints = numPointsForOutsideEdge[0] +
+                           numPointsForOutsideEdge[1] +
+                           numPointsForOutsideEdge[2];
+
+         int numRings18 = numRings * 18;
+         NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) +
+                      (3 * OuterPoints) - (numRings18 * (numRings - 1)) +
+                      (insideTessFactorOdd ? 3 : 0);
+      }
+
+      // Generate the draw and allocate the index buffer now that we know the size
+      ctx.Index = poly_draw(p, mode, false, patch, NumIndices);
+
+      if (mode == POLY_TESS_MODE_COUNT)
+         return;
+
+      int insideOffset = insideEdgePointBaseOffset;
+      int outsideEdgePointBaseOffset = 0;
+
+      NumIndices = 0;
+      for (int ring = 1; ring < numRings; ring++) {
+         int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring;
+         int edge0InsidePointBaseOffset = insideOffset;
+         int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
+         for (int edge = 0; edge < TRI_EDGES; edge++) {
+            int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge]
+                                          : (numPointsForInsideEdge + 2);
+
+            int numTriangles = numPointsForInsideEdge + outsidePoints - 2;
+
+            int insideBaseOffset;
+            int outsideBaseOffset;
+            if (edge == 2) {
+               ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue =
+                  insideOffset;
+               ctx.IndexPatchCtx.insidePointIndexBadValue =
+                  numPointsForInsideEdge - 1;
+               ctx.IndexPatchCtx.insidePointIndexReplacementValue =
+                  edge0InsidePointBaseOffset;
+               ctx.IndexPatchCtx.outsidePointIndexPatchBase =
+                  ctx.IndexPatchCtx.insidePointIndexBadValue +
+                  1; // past inside patched index range
+               ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue =
+                  outsideEdgePointBaseOffset -
+                  ctx.IndexPatchCtx.outsidePointIndexPatchBase;
+               ctx.IndexPatchCtx.outsidePointIndexBadValue =
+                  ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints -
+                  1;
+               ctx.IndexPatchCtx.outsidePointIndexReplacementValue =
+                  edge0OutsidePointBaseOffset;
+               ctx.bUsingPatchedIndices = true;
+               insideBaseOffset = 0;
+               outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase;
+            } else {
+               insideBaseOffset = insideOffset;
+               outsideBaseOffset = outsideEdgePointBaseOffset;
+            }
+            if (ring == 1) {
+               StitchTransition(
+                  &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset,
+                  insideTessFactorCtx.numHalfTessFactorPoints,
+                  insideTessFactorOdd, outsideBaseOffset,
+                  outsideTessFactorCtx[edge].numHalfTessFactorPoints,
+                  outsideTessFactorOdd[edge]);
+            } else {
+               StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED,
+                             /*baseIndexOffset: */ NumIndices,
+                             numPointsForInsideEdge, insideBaseOffset,
+                             outsideBaseOffset);
+            }
+            if (2 == edge) {
+               ctx.bUsingPatchedIndices = false;
+            }
+            NumIndices += numTriangles * 3;
+            outsideEdgePointBaseOffset += outsidePoints - 1;
+            insideOffset += numPointsForInsideEdge - 1;
+         }
+      }
+      if (insideTessFactorOdd) {
+         // Triangulate center (a single triangle)
+         DefineTriangle(&ctx, outsideEdgePointBaseOffset,
+                        outsideEdgePointBaseOffset + 1,
+                        outsideEdgePointBaseOffset + 2, NumIndices);
+         NumIndices += 3;
+      }
+   }
+}
+
+static inline void
+poly_tess_quad_process(constant struct poly_tess_args *p, uint32_t patch,
+                       enum poly_tess_mode mode)
+{
+   enum poly_tess_partitioning partitioning = p->partitioning;
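+
+   /* The reads below assume the 4-outer + 2-inner factor layout: factors[0..3]
+    * are the Ueq0/Veq0/Ueq1/Veq1 edges and factors[4..5] the inside U/V
+    * factors.
+    */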
+ global float *factors = tess_factors(p, patch); + + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Ueq1 = factors[2]; + float tessFactor_Veq1 = factors[3]; + + float insideTessFactor_U = factors[4]; + float insideTessFactor_V = factors[5]; + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + // Is the patch culled? + if (!(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0)) { + poly_draw_empty(p, mode, patch); + return; + } + + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + bool outsideTessFactorOdd[QUAD_EDGES]; + bool insideTessFactorOdd[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + + // Clamp edge TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); + tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); + + float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), + max(tessFactor_Ueq1, tessFactor_Veq1)), + max(insideTessFactor_U, insideTessFactor_V)); + + insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); + insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); + // Note the above clamps map NaN to lowerBound + + // Process tessFactors + float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, + insideTessFactor_V}; + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + for (int axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && + (1.0f != insideTessFactor_f[axis]); + } + } else { + bool odd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; + } + + // Save fixed point TessFactors + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + for (int axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); + } + + if (partitioning != POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor[U]) && + (FXP_ONE == insideTessFactor[V]) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Ueq1]) && + (FXP_ONE == outsideTessFactor[Veq1])) { + + /* Just do minimum tess factor */ + if (!p->points_mode) { + ctx.Index = poly_draw(p, mode, false, patch, 6); + if (mode == POLY_TESS_MODE_COUNT) + return; + + DefineTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); + DefineTriangle(&ctx, 1, 2, 3, /*indexStorageOffset*/ 3); + } else { + poly_draw_points(&ctx, p, patch, 4); + if (mode == POLY_TESS_MODE_COUNT) + 
return; + } + + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, 4); + + DefinePoint(&points[0], 0, 0); + DefinePoint(&points[1], FXP_ONE, 0); + DefinePoint(&points[2], FXP_ONE, FXP_ONE); + DefinePoint(&points[3], 0, FXP_ONE); + return; + } + } + + // Compute TessFactor-specific metadata + for (int edge = 0; edge < QUAD_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + + for (int axis = 0; axis < QUAD_AXES; axis++) { + ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], + &insideTessFactorCtx[axis]); + } + + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < QUAD_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 4; + + // inside edge offsets + for (int axis = 0; axis < QUAD_AXES; axis++) { + numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( + insideTessFactorOdd[axis], insideTessFactor[axis]); + int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor[axis] = + max(pointCountMin, numPointsForInsideTessFactor[axis]); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * + (numPointsForInsideTessFactor[V] - 2); + NumPoints += numInteriorPoints; + + if (mode != POLY_TESS_MODE_COUNT) { + ctx.Point = poly_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = edge & 0x1; + // don't include end, since next edge starts with it. + int endPoint = numPointsForOutsideEdge[edge] - 1; + for (int p = 0; p < endPoint; p++, pointOffset++) { + int q = + ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order + FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + + FXP u = odd ? fxpParam : ((edge == 2) ? FXP_ONE : 0); + FXP v = odd ? ((edge == 3) ? FXP_ONE : 0) : fxpParam; + DefinePoint(&ctx.Point[pointOffset], u, v); + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) + // spiralling toward center + int minNumPointsForTessFactor = + min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); + // note for even tess we aren't counting center point here. + int numRings = (minNumPointsForTessFactor >> 1); + + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 1 - startPoint, + numPointsForInsideTessFactor[V] - 1 - startPoint, + }; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; + int perpendicularAxisPoint = + (edge < 2) ? startPoint : endPoint[odd[0]]; + FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], + insideTessFactorOdd[odd[0]], + perpendicularAxisPoint); + + for (int p = startPoint; p < endPoint[odd[1]]; p++, + pointOffset++) // don't include end: next edge starts with + // it. + { + bool odd_ = odd[1]; + int q = ((edge == 1) || (edge == 2)) + ? 
p + : endPoint[odd_] - (p - startPoint); + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd_], + insideTessFactorOdd[odd_], q); + DefinePoint(&ctx.Point[pointOffset], + odd_ ? fxpPerpParam : fxpParam, + odd_ ? fxpParam : fxpPerpParam); + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + !insideTessFactorOdd[V]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; + for (int p = startPoint; p <= endPoint; p++, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], + insideTessFactorOdd[U], p); + DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); + } + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + !insideTessFactorOdd[U]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; + for (int p = endPoint; p >= startPoint; p--, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], + insideTessFactorOdd[V], p); + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); + } + } + } + + if (p->points_mode) { + poly_draw_points(&ctx, p, patch, NumPoints); + return; + } + + /* CONNECTIVITY */ + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring. +1 is so even tess includes the center point + int numPointRowsToCenter[QUAD_AXES] = { + (numPointsForInsideTessFactor[U] + 1) >> 1, + (numPointsForInsideTessFactor[V] + 1) >> 1, + }; + + int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); + + /* Calculate # of indices so we can allocate */ + { + /* Handle main case */ + int OuterPoints = + numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; + + int InnerPoints = + numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; + + int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - + (InnerPoints * 18) - (24 * numRings * (numRings - 1)); + + /* Determine major/minor axes */ + bool U_major = + (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); + unsigned M = U_major ? U : V; + unsigned m = U_major ? V : U; + + /* Handle degenerate ring */ + if (insideTessFactorOdd[m]) { + NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - + (numPointsForInsideTessFactor[m] >> 1)); + NumIndices += (insideTessFactorOdd[M] ? 6 : 12); + } + + // Generate the draw and allocate the index buffer with the size + ctx.Index = poly_draw(p, mode, false, patch, NumIndices); + } + + if (mode == POLY_TESS_MODE_COUNT) + return; + + int degeneratePointRing[QUAD_AXES] = { + // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, + !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, + }; + + int numPointsForOutsideEdge_[QUAD_EDGES] = { + numPointsForOutsideEdge[Ueq0], + numPointsForOutsideEdge[Veq0], + numPointsForOutsideEdge[Ueq1], + numPointsForOutsideEdge[Veq1], + }; + + int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + int NumIndices = 0; + + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 2 * ring, + numPointsForInsideTessFactor[V] - 2 * ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = (edge + 1) & 0x1; + + int numTriangles = + numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + + // We need to patch the indexing so Stitch() can think it sees 2 + // sequentially increasing rows of points, even though we have + // wrapped around to the end of the inner and outer ring's points, + // so the last point is really the first point for the ring. We make + // it so that when Stitch() calls AddIndex(), that function will do + // any necessary index adjustment. + if (edge == 3) { + if (ring == degeneratePointRing[odd]) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_ + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = + outsideEdgePointBaseOffset + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + edge0OutsidePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge[odd] - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.bUsingPatchedIndices = true; + } + } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert << 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + insideBaseOffset = insideEdgePointBaseOffset_; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx[odd].numHalfTessFactorPoints, + insideTessFactorOdd[odd], outsideBaseOffset, + 
outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge[odd], insideBaseOffset, + outsideBaseOffset); + } + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; + if ((edge == 2) && (ring == degeneratePointRing[odd])) { + insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; + } else { + insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; + } + numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + insideTessFactorOdd[V]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - + (numPointsForInsideTessFactor[V] >> 1)) + << 1) + + (insideTessFactorOdd[U] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 2; + ctx.IndexPatchCtx2.cornerCaseBadValue = + ctx.IndexPatchCtx2.baseIndexToInvert; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + outsideEdgePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + StitchRegular( + &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset + 1); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + insideTessFactorOdd[U]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - + (numPointsForInsideTessFactor[U] >> 1)) + << 1) + + (insideTessFactorOdd[V] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = insideTessFactorOdd[V] + ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE
+                             : DIAGONALS_INSIDE_TO_OUTSIDE;
+         StitchRegular(
+            &ctx, /*bTrapezoid*/ false, diag,
+            /*baseIndexOffset: */ NumIndices,
+            /*numInsideEdgePoints:*/ stripNumQuads + 1,
+            /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert,
+            outsideEdgePointBaseOffset);
+         ctx.bUsingPatchedIndices2 = false;
+         NumIndices += stripNumQuads * 6;
+      }
+   }
+}
diff --git a/src/poly/geometry.h b/src/poly/geometry.h
new file mode 100644
index 00000000000..4048b956307
--- /dev/null
+++ b/src/poly/geometry.h
@@ -0,0 +1,641 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl.h"
+#include "compiler/shader_enums.h"
+
+#include "util/bitscan.h"
+#include "util/u_math.h"
+
+#ifdef __OPENCL_VERSION__
+#include "compiler/libcl/libcl_vk.h"
+#endif
+
+#pragma once
+
+#define POLY_MAX_SO_BUFFERS 4
+#define POLY_MAX_VERTEX_STREAMS 4
+
+enum poly_gs_shape {
+   /* Indexed, where indices are encoded as:
+    *
+    *    round_to_pot(max_indices) * round_to_pot(input_primitives)
+    *       * instance_count
+    *
+    * invoked for max_indices * input_primitives * instance_count indices.
+    *
+    * This is used with any dynamic topology. No hardware instancing used.
+    */
+   POLY_GS_SHAPE_DYNAMIC_INDEXED,
+
+   /* Indexed with a static index buffer. Indices range up to max_indices.
+    * Hardware instance count = input_primitives * software instance count.
+    */
+   POLY_GS_SHAPE_STATIC_INDEXED,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices, input_primitives * instance count).
+    */
+   POLY_GS_SHAPE_STATIC_PER_PRIM,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices * input_primitives, instance count).
+    */
+   POLY_GS_SHAPE_STATIC_PER_INSTANCE,
+};
+
+static inline unsigned
+poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
+                      unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return max_indices * input_primitives * instance_count;
+
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+   case POLY_GS_SHAPE_STATIC_PER_PRIM:
+      return max_indices;
+
+   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
+      return max_indices * input_primitives;
+   }
+
+   UNREACHABLE("invalid shape");
+}
+
+static inline unsigned
+poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
+                       unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return 1;
+
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+   case POLY_GS_SHAPE_STATIC_PER_PRIM:
+      return input_primitives * instance_count;
+
+   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
+      return instance_count;
+   }
+
+   UNREACHABLE("invalid shape");
+}
+
+static inline bool
+poly_gs_indexed(enum poly_gs_shape shape)
+{
+   return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
+          shape == POLY_GS_SHAPE_STATIC_INDEXED;
+}
+
+static inline unsigned
+poly_gs_index_size(enum poly_gs_shape shape)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return 4;
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+/* Heap to allocate from.
*/ +struct poly_heap { + DEVICE(uchar) base; + uint32_t bottom, size; +} PACKED; +static_assert(sizeof(struct poly_heap) == 4 * 4); + +#ifdef __OPENCL_VERSION__ +static inline uint +_poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B, bool atomic) +{ + size_B = align(size_B, 16); + + uint offs; + if (atomic) { + offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B); + } else { + offs = heap->bottom; + heap->bottom = offs + size_B; + } + + /* Use printf+abort because assert is stripped from release builds. */ + if (heap->bottom >= heap->size) { + printf( + "FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!", + size_B, offs, heap->size); + + abort(); + } + + return offs; +} + +static inline uint +poly_heap_alloc_nonatomic_offs(global struct poly_heap *heap, uint size_B) +{ + return _poly_heap_alloc_offs(heap, size_B, false); +} + +static inline uint +poly_heap_alloc_atomic_offs(global struct poly_heap *heap, uint size_B) +{ + return _poly_heap_alloc_offs(heap, size_B, true); +} + +static inline global void * +poly_heap_alloc_nonatomic(global struct poly_heap *heap, uint size_B) +{ + return heap->base + poly_heap_alloc_nonatomic_offs(heap, size_B); +} + +uint64_t nir_load_ro_sink_address_poly(void); + +static inline uint64_t +poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el, + uint elsize_B) +{ + if (offset_el < size_el) + return index_buffer + (offset_el * elsize_B); + else + return nir_load_ro_sink_address_poly(); +} +#endif + +struct poly_ia_state { + /* Index buffer if present. */ + uint64_t index_buffer; + + /* Size of the bound index buffer for bounds checking */ + uint32_t index_buffer_range_el; + + /* Number of vertices per instance. Written by CPU for direct draw, indirect + * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing. + */ + uint32_t verts_per_instance; +} PACKED; +static_assert(sizeof(struct poly_ia_state) == 4 * 4); + +static inline uint +poly_index_buffer_range_el(uint size_el, uint offset_el) +{ + return offset_el < size_el ? (size_el - offset_el) : 0; +} + +struct poly_geometry_params { + /* Address of associated indirect draw buffer */ + DEVICE(uint) indirect_desc; + + /* Address of count buffer. For an indirect draw, this will be written by the + * indirect setup kernel. + */ + DEVICE(uint) count_buffer; + + /* Address of the primitives generated counters */ + DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_any_overflow; + + /* Pointers to transform feedback buffer offsets in bytes */ + DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS]; + + /* Output index buffer, allocated by pre-GS. */ + DEVICE(uint) output_index_buffer; + + /* Address of transform feedback buffer in general, supplied by the CPU. */ + DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS]; + + /* Address of transform feedback for the current primitive. Written by pre-GS + * program. + */ + DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS]; + + /* Address and present mask for the input to the geometry shader. These will + * reflect the vertex shader for VS->GS or instead the tessellation + * evaluation shader for TES->GS. + */ + uint64_t input_buffer; + uint64_t input_mask; + + /* Location-indexed mask of flat outputs, used for lowering GL edge flags. 
*/
+   uint64_t flat_outputs;
+
+   uint32_t xfb_size[POLY_MAX_SO_BUFFERS];
+
+   /* Number of vertices emitted by transform feedback per stream. Written by
+    * the pre-GS program.
+    */
+   uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];
+
+   /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
+    * out by the GS indirect setup kernel or the CPU for a direct draw. This is
+    * the "indirect local" format: the first 3 entries are in threads, the
+    * second 3 in grid blocks. This lets us use nontrivial workgroups with
+    * indirect draws without needing any predication.
+    */
+   uint32_t vs_grid[6];
+   uint32_t gs_grid[6];
+
+   /* Number of input primitives across all instances, calculated by the CPU
+    * for a direct draw or the GS indirect setup kernel for an indirect draw.
+    */
+   uint32_t input_primitives;
+
+   /* Number of input primitives per instance, rounded up to a power-of-two and
+    * with the base-2 log taken. This is used to partition the output vertex
+    * IDs efficiently.
+    */
+   uint32_t primitives_log2;
+
+   /* Number of bytes output by the GS count shader per input primitive (may be
+    * 0), written by CPU and consumed by indirect draw setup shader for
+    * allocating counts.
+    */
+   uint32_t count_buffer_stride;
+
+   /* Dynamic input topology. Must be compatible with the geometry shader's
+    * layout() declared input class.
+    */
+   uint32_t input_topology;
+} PACKED;
+static_assert(sizeof(struct poly_geometry_params) == 86 * 4);
+
+/* TCS shared memory layout:
+ *
+ *    vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
+ *
+ * TODO: compact.
+ */
+static inline uint
+poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
+                    uint64_t crosslane_vs_out_mask)
+{
+   uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
+   uint offs = util_bitcount64(crosslane_vs_out_mask &
+                               (((uint64_t)(1) << location) - 1));
+
+   return base + offs;
+}
+
+static inline uint
+poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
+{
+   return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
+}
+
+/*
+ * TCS out buffer layout, per-patch:
+ *
+ *    float tess_level_outer[4];
+ *    float tess_level_inner[2];
+ *    vec4 patch_out[MAX_PATCH_OUTPUTS];
+ *    vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
+ *
+ * Vertex outputs are compacted based on the mask of written outputs. Patch
+ * outputs are used as-is.
+ *
+ * Bounding boxes are ignored.
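+ *
+ * Worked example (hypothetical numbers): with nr_patch_out == 2 and a
+ * vtx_out_mask covering only POS and VAR0, vertex 1's VAR0 lands at element
+ * (32-bit word) 4 + 2 + 4*2 + 4*1*2 + 4*1 == 26, i.e. byte offset 104.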
+ */
+static inline uint
+poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
+                     uint64_t vtx_out_mask)
+{
+   uint off = 0;
+   if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
+      return off;
+
+   off += 4;
+   if (location == VARYING_SLOT_TESS_LEVEL_INNER)
+      return off;
+
+   off += 2;
+   if (location >= VARYING_SLOT_PATCH0)
+      return off + (4 * (location - VARYING_SLOT_PATCH0));
+
+   /* Anything else is a per-vtx output */
+   off += 4 * nr_patch_out;
+   off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
+
+   uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
+   return off + (4 * idx);
+}
+
+static inline uint
+poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
+                       uint64_t vtx_out_mask)
+{
+   return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
+                               vtx_out_mask);
+}
+
+static inline uint
+poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
+                    uint64_t vtx_out_mask)
+{
+   return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
+          4;
+}
+
+/* In a tess eval shader, stride for hw vertex ID */
+#define POLY_TES_PATCH_ID_STRIDE 8192
+
+static inline uint
+poly_compact_prim(enum mesa_prim prim)
+{
+   static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
+   static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
+
+#ifndef __OPENCL_VERSION__
+   assert(prim != MESA_PRIM_QUADS);
+   assert(prim != MESA_PRIM_QUAD_STRIP);
+   assert(prim != MESA_PRIM_POLYGON);
+   assert(prim != MESA_PRIM_PATCHES);
+#endif
+
+   return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
+}
+
+static inline enum mesa_prim
+poly_uncompact_prim(uint packed)
+{
+   if (packed >= MESA_PRIM_QUADS)
+      return (enum mesa_prim)(packed + 3);
+
+   return (enum mesa_prim)packed;
+}
+
+/*
+ * Write a strip into a 32-bit index buffer. This is the sequence:
+ *
+ *    (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
+ *
+ * For points, we write index buffers without restart just for remapping.
+ */
+static inline void
+_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
+                  uint32_t vertex_offset, uint32_t verts_in_prim,
+                  uint32_t stream, uint32_t stream_multiplier, uint32_t n)
+{
+   bool restart = n > 1;
+   if (verts_in_prim < n)
+      return;
+
+   GLOBAL uint32_t *out = &index_buffer[index_offset];
+
+   /* Write out indices for the strip */
+   for (uint32_t i = 0; i < verts_in_prim; ++i) {
+      out[i] = (vertex_offset + i) * stream_multiplier + stream;
+   }
+
+   if (restart)
+      out[verts_in_prim] = -1;
+}
+
+static inline unsigned
+poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
+                                             unsigned verts_per_patch)
+{
+   if (prim >= MESA_PRIM_PATCHES) {
+      return vertices / verts_per_patch;
+   } else {
+      return u_decomposed_prims_for_vertices(prim, vertices);
+   }
+}
+
+#ifdef __OPENCL_VERSION__
+/*
+ * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
+ * manually with subgroup ops and local memory since Mesa doesn't do those
+ * lowerings yet.
+ */
+static inline uint2
+poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
+{
+   uint sg_id = get_sub_group_id();
+
+   /* Partial prefix sum of the subgroup */
+   uint sg = sub_group_scan_inclusive_add(x);
+
+   /* Reduction (sum) for the subgroup */
+   uint sg_sum = sub_group_broadcast(sg, 31);
+
+   /* Write out all the subgroup sums */
+   barrier(CLK_LOCAL_MEM_FENCE);
+   scratch[sg_id] = sg_sum;
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
+    * threads in subgroup T.
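+    *
+    * Worked example (assuming unused scratch entries read as zero): if the
+    * active subgroup sums are {3, 5, 2}, the exclusive scan below gives bases
+    * {0, 3, 8}, so a thread in subgroup 1 with intra-subgroup inclusive sum s
+    * returns (3 + s, 10).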
+    */
+   uint other_sum = scratch[get_sub_group_local_id()];
+
+   /* Exclusive sum the subgroup sums to get the total before the current group,
+    * which can be added to the total for the current group.
+    */
+   uint other_sums = sub_group_scan_exclusive_add(other_sum);
+   uint base = sub_group_broadcast(other_sums, sg_id);
+   uint prefix = base + sg;
+
+   /* Reduce the workgroup using the prefix sum we already did */
+   uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
+
+   return (uint2)(prefix, reduction);
+}
+
+static inline void
+poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
+                uint word, uint wg_count)
+{
+   uint tid = cl_local_id.x;
+
+   /* Main loop: complete workgroups processing multiple values at once */
+   uint i, count = 0;
+   uint len_remainder = len % wg_count;
+   uint len_rounded_down = len - len_remainder;
+
+   for (i = tid; i < len_rounded_down; i += wg_count) {
+      global uint *ptr = &buffer[(i * words) + word];
+      uint value = *ptr;
+      uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);
+
+      *ptr = count + sums[0];
+      count += sums[1];
+   }
+
+   /* The last iteration is special since we won't have a full subgroup unless
+    * the length is divisible by the subgroup size, and we don't advance count.
+    */
+   global uint *ptr = &buffer[(i * words) + word];
+   uint value = (tid < len_remainder) ? *ptr : 0;
+   uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];
+
+   if (tid < len_remainder) {
+      *ptr = count + scan;
+   }
+}
+
+static inline void
+poly_increment_counters(global uint32_t *a, global uint32_t *b,
+                        global uint32_t *c, uint count)
+{
+   global uint32_t *ptr[] = {a, b, c};
+
+   for (uint i = 0; i < 3; ++i) {
+      if (ptr[i]) {
+         *(ptr[i]) += count;
+      }
+   }
+}
+
+static inline void
+poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
+                  global uint32_t *vs_invocations, global uint32_t *c_prims,
+                  global uint32_t *c_invs, constant uint32_t *draw,
+                  enum mesa_prim prim, unsigned verts_per_patch)
+{
+   poly_increment_counters(ia_vertices, vs_invocations, NULL,
+                           draw[0] * draw[1]);
+
+   uint prims = poly_decomposed_prims_for_vertices_with_tess(prim, draw[0],
+                                                             verts_per_patch) *
+                draw[1];
+
+   poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
+}
+
+static inline void
+poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
+                       global uintptr_t *vertex_buffer /* output */,
+                       global struct poly_ia_state *ia /* output */,
+                       global struct poly_geometry_params *p /* output */,
+                       global struct poly_heap *heap,
+                       uint64_t vs_outputs /* Vertex (TES) output mask */,
+                       uint32_t index_size_B /* 0 if no index buffer */,
+                       uint32_t index_buffer_range_el,
+                       uint32_t prim /* Input primitive type, enum mesa_prim */,
+                       int is_prefix_summing, uint max_indices,
+                       enum poly_gs_shape shape)
+{
+   /* Determine the (primitives, instances) grid size.
*/ + uint vertex_count = draw[0]; + uint instance_count = draw[1]; + + ia->verts_per_instance = vertex_count; + + /* Calculate number of primitives input into the GS */ + uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count); + p->input_primitives = prim_per_instance * instance_count; + + /* Invoke VS as (vertices, instances); GS as (primitives, instances) */ + p->vs_grid[0] = vertex_count; + p->vs_grid[1] = instance_count; + + p->gs_grid[0] = prim_per_instance; + p->gs_grid[1] = instance_count; + + p->primitives_log2 = util_logbase2_ceil(prim_per_instance); + + /* If indexing is enabled, the third word is the offset into the index buffer + * in elements. Apply that offset now that we have it. For a hardware + * indirect draw, the hardware would do this for us, but for software input + * assembly we need to do it ourselves. + */ + if (index_size_B) { + ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el, + draw[2], index_size_B); + + ia->index_buffer_range_el = + poly_index_buffer_range_el(index_buffer_range_el, draw[2]); + } + + /* We need to allocate VS and GS count buffers, do so now */ + uint vertex_buffer_size = + poly_tcs_in_size(vertex_count * instance_count, vs_outputs); + + if (is_prefix_summing) { + p->count_buffer = poly_heap_alloc_nonatomic( + heap, p->input_primitives * p->count_buffer_stride); + } + + p->input_buffer = + (uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size); + *vertex_buffer = p->input_buffer; + + p->input_mask = vs_outputs; + + /* Allocate the index buffer and write the draw consuming it */ + global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc; + + *cmd = (VkDrawIndexedIndirectCommand){ + .indexCount = poly_gs_rast_vertices(shape, max_indices, prim_per_instance, + instance_count), + .instanceCount = poly_gs_rast_instances( + shape, max_indices, prim_per_instance, instance_count), + }; + + if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { + cmd->firstIndex = + poly_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4; + + p->output_index_buffer = + (global uint *)(heap->base + (cmd->firstIndex * 4)); + } +} + +static uint +poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, + uint index_size) +{ + bool oob = id >= index_buffer_range_el; + + /* If the load would be out-of-bounds, load the first element which is + * assumed valid. If the application index buffer is empty with robustness2, + * index_buffer will point to a zero sink where only the first is valid. + */ + if (oob) { + id = 0; + } + + uint el; + if (index_size == 1) { + el = ((constant uint8_t *)index_buffer)[id]; + } else if (index_size == 2) { + el = ((constant uint16_t *)index_buffer)[id]; + } else { + el = ((constant uint32_t *)index_buffer)[id]; + } + + /* D3D robustness semantics. TODO: Optimize? */ + if (oob) { + el = 0; + } + + return el; +} + +static void +poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value) +{ + global uint32_t *out_32 = (global uint32_t *)index_buffer; + global uint16_t *out_16 = (global uint16_t *)index_buffer; + global uint8_t *out_8 = (global uint8_t *)index_buffer; + + if (index_size_B == 4) + out_32[id] = value; + else if (index_size_B == 2) + out_16[id] = value; + else + out_8[id] = value; +} + +#endif diff --git a/src/poly/meson.build b/src/poly/meson.build new file mode 100644 index 00000000000..3f0e75b6ec5 --- /dev/null +++ b/src/poly/meson.build @@ -0,0 +1,9 @@ +# Copyright © 2025 Collabora Ltd. 
+# SPDX-License-Identifier: MIT + +inc_poly = include_directories([ + '.', 'nir' +]) + +subdir('cl') +subdir('nir') diff --git a/src/poly/nir/meson.build b/src/poly/nir/meson.build new file mode 100644 index 00000000000..5560f5c860f --- /dev/null +++ b/src/poly/nir/meson.build @@ -0,0 +1,18 @@ +# Copyright © 2025 Collabora Ltd. +# SPDX-License-Identifier: MIT + +libpoly_nir_files = files( + 'poly_nir_lower_gs.c', + 'poly_nir_lower_ia.c', + 'poly_nir_lower_tess.c', +) + +libpoly_nir = static_library( + 'libpoly_nir', + [libpoly_nir_files], + include_directories : [inc_poly], + c_args : [no_override_init_args, '-Wno-c2x-extensions'], + gnu_symbol_visibility : 'hidden', + dependencies: [idep_nir, idep_mesautil, idep_libpoly], + build_by_default : false, +) diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/poly/nir/poly_nir_lower_gs.c similarity index 92% rename from src/asahi/lib/agx_nir_lower_gs.c rename to src/poly/nir/poly_nir_lower_gs.c index 30630bedbf5..79e2fb9b038 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/poly/nir/poly_nir_lower_gs.c @@ -5,11 +5,11 @@ * SPDX-License-Identifier: MIT */ -#include "agx_nir_lower_gs.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "compiler/nir/nir_builder.h" #include "gallium/include/pipe/p_defines.h" -#include "libagx/geometry.h" -#include "libagx/libagx.h" +#include "poly/cl/libpoly.h" +#include "poly/geometry.h" #include "util/bitscan.h" #include "util/list.h" #include "util/macros.h" @@ -85,7 +85,7 @@ rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_) } static bool -agx_nir_lower_gs_intrinsics(nir_shader *shader) +lower_gs_intrinsics(nir_shader *shader) { struct state state; nir_function_impl *impl = nir_shader_get_entrypoint(shader); @@ -158,16 +158,16 @@ agx_nir_lower_gs_intrinsics(nir_shader *shader) } struct lower_gs_state { - int static_count[MAX_VERTEX_STREAMS]; + int static_count[POLY_MAX_VERTEX_STREAMS]; /* The index of each counter in the count buffer, or -1 if it's not in the * count buffer. * * Invariant: info->count_words == sum(count_index[i] >= 0). 
*/ - int count_index[MAX_VERTEX_STREAMS]; + int count_index[POLY_MAX_VERTEX_STREAMS]; - struct agx_gs_info *info; + struct poly_gs_info *info; }; /* Helpers for loading from the geometry state buffer */ @@ -184,8 +184,8 @@ load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes) #define load_geometry_param(b, field) \ load_geometry_param_offset( \ - b, offsetof(struct agx_geometry_params, field), \ - sizeof(((struct agx_geometry_params *)0)->field)) + b, offsetof(struct poly_geometry_params, field), \ + sizeof(((struct poly_geometry_params *)0)->field)) /* Helpers for lowering I/O to variables */ struct lower_output_to_var_state { @@ -257,18 +257,18 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls) return prim; case MESA_PRIM_LINES: - return libagx_vertex_id_for_line_class(b, topology, prim, vert, nr); + return poly_vertex_id_for_line_class(b, topology, prim, vert, nr); case MESA_PRIM_TRIANGLES: - return libagx_vertex_id_for_tri_class(b, topology, prim, vert, - flatshade_first); + return poly_vertex_id_for_tri_class(b, topology, prim, vert, + flatshade_first); case MESA_PRIM_LINES_ADJACENCY: - return libagx_vertex_id_for_line_adj_class(b, topology, prim, vert); + return poly_vertex_id_for_line_adj_class(b, topology, prim, vert); case MESA_PRIM_TRIANGLES_ADJACENCY: - return libagx_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr, - flatshade_first); + return poly_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr, + flatshade_first); default: UNREACHABLE("invalid topology class"); @@ -276,8 +276,8 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls) } nir_def * -agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, - nir_def *vertex) +poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, + nir_def *vertex) { assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input); nir_io_semantics sem = nir_intrinsic_io_semantics(intr); @@ -287,15 +287,15 @@ agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, if (b->shader->info.stage == MESA_SHADER_GEOMETRY) { /* GS may be preceded by VS or TES so specified as param */ - addr = libagx_geometry_input_address( + addr = poly_geometry_input_address( b, nir_load_geometry_param_buffer_poly(b), vertex, location); } else { assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL); /* TCS always preceded by VS so we use the VS state directly */ - addr = libagx_vertex_output_address(b, nir_load_vs_output_buffer_poly(b), - nir_load_vs_outputs_poly(b), vertex, - location); + addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b), + nir_load_vs_outputs_poly(b), vertex, + location); } addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr)); @@ -320,7 +320,7 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_) nir_def *unrolled = nir_iadd(b, nir_imul(b, nir_load_instance_id(b), verts), vertex); - nir_def *val = agx_load_per_vertex_input(b, intr, unrolled); + nir_def *val = poly_load_per_vertex_input(b, intr, unrolled); nir_def_replace(&intr->def, val); return true; } @@ -377,10 +377,10 @@ write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr, nir_def *id = state->info->prefix_sum ? 
calc_unrolled_id(b) : nir_imm_int(b, 0); - nir_def *addr = libagx_load_xfb_count_address( - b, nir_load_geometry_param_buffer_poly(b), - nir_imm_int(b, state->count_index[stream]), - nir_imm_int(b, state->info->count_words), id); + nir_def *addr = + poly_load_xfb_count_address(b, nir_load_geometry_param_buffer_poly(b), + nir_imm_int(b, state->count_index[stream]), + nir_imm_int(b, state->info->count_words), id); if (state->info->prefix_sum) { nir_store_global(b, addr, 4, intr->src[2].ssa, nir_component_mask(1)); @@ -656,7 +656,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs)); switch (state->info->shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: { + case POLY_GS_SHAPE_DYNAMIC_INDEXED: { unsigned stride = output_vertex_id_pot_stride(gs); nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride); @@ -669,8 +669,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) break; } - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: { + case POLY_GS_SHAPE_STATIC_INDEXED: + case POLY_GS_SHAPE_STATIC_PER_PRIM: { nir_def *stride = load_geometry_param(b, gs_grid[0]); rs.output_id = raw_vertex_id; @@ -679,7 +679,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) break; } - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: { + case POLY_GS_SHAPE_STATIC_PER_INSTANCE: { unsigned stride = MAX2(state->info->max_indices, 1); rs.output_id = nir_umod_imm(b, raw_vertex_id, stride); @@ -733,8 +733,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) for (unsigned p_ = 0; p_ < n_; ++p_) { nir_def *p = nir_imm_int(b, p_); - nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip, - strip_length, p)); + nir_push_if(b, poly_xfb_vertex_copy_in_strip(b, n, id_in_strip, + strip_length, p)); /* Write XFB for each output */ for (unsigned i = 0; i < xfb->output_count; ++i) { @@ -746,14 +746,14 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) * base for this invocation for the stream plus the offset within * this invocation. 
          */
-         nir_def *invocation_base = libagx_previous_xfb_primitives(
+         nir_def *invocation_base = poly_previous_xfb_primitives(
             b, nir_load_geometry_param_buffer_poly(b),
             nir_imm_int(b, state->static_count[stream]),
             nir_imm_int(b, state->count_index[stream]),
             nir_imm_int(b, state->info->count_words),
             nir_imm_bool(b, state->info->prefix_sum), unrolled);
 
-         nir_def *index = libagx_xfb_vertex_offset(
+         nir_def *index = poly_xfb_vertex_offset(
             b, n, invocation_base, base, id_in_strip, p,
             nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
 
@@ -776,7 +776,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
           */
          value = nir_pad_vector_imm_int(b, value, 0, 4);
 
-         nir_def *addr = libagx_xfb_vertex_address(
+         nir_def *addr = poly_xfb_vertex_address(
             b, nir_load_geometry_param_buffer_poly(b), index,
             nir_imm_int(b, buffer), nir_imm_int(b, stride),
             nir_imm_int(b, output.offset));
 
@@ -842,12 +842,12 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
 
    switch (intr->intrinsic) {
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
         break;
 
       /* All streams are merged, just pick a single instruction */
       if (nir_intrinsic_stream_id(intr) == 0) {
-         libagx_pad_index_gs(
+         poly_pad_index_gs(
             b, load_geometry_param(b, output_index_buffer),
             nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
             intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
@@ -857,10 +857,10 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
    }
 
    case nir_intrinsic_emit_primitive_poly: {
-      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
-      libagx_write_strip(
+      poly_write_strip(
          b, load_geometry_param(b, output_index_buffer),
         nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
         intr->src[0].ssa,
@@ -903,14 +903,14 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    return true;
 }
 
-struct agx_xfb_key {
+struct poly_xfb_key {
    uint8_t streams;
    uint8_t buffers_written;
    uint8_t buffer_to_stream[NIR_MAX_XFB_BUFFERS];
    int8_t count_index[4];
    uint16_t stride[NIR_MAX_XFB_BUFFERS];
    uint16_t output_end[NIR_MAX_XFB_BUFFERS];
-   int16_t static_count[MAX_VERTEX_STREAMS];
+   int16_t static_count[POLY_MAX_VERTEX_STREAMS];
    uint16_t invocations;
    uint16_t vertices_per_prim;
 };
@@ -921,14 +921,14 @@ struct agx_xfb_key {
  * transform feedback offsets and counters as applicable.
  */
 static nir_shader *
-create_pre_gs(struct agx_xfb_key *key,
+create_pre_gs(struct poly_xfb_key *key,
               const nir_shader_compiler_options *options)
 {
    nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                    options, "Pre-GS patch up");
    nir_builder *b = &b_;
 
-   libagx_pre_gs(
+   poly_pre_gs(
       b, nir_load_geometry_param_buffer_poly(b), nir_imm_int(b, key->streams),
       nir_imm_int(b, key->buffers_written),
       nir_imm_ivec4(b, key->buffer_to_stream[0], key->buffer_to_stream[1],
@@ -1033,7 +1033,7 @@ calculate_max_indices(enum mesa_prim prim, unsigned verts)
 }
 
 struct topology_ctx {
-   struct agx_gs_info *info;
+   struct poly_gs_info *info;
    uint32_t topology[384];
 };
 
@@ -1041,7 +1041,7 @@ static bool
 evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    struct topology_ctx *ctx = data;
-   struct agx_gs_info *info = ctx->info;
+   struct poly_gs_info *info = ctx->info;
 
    if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
       return false;
@@ -1050,7 +1050,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
     * if-statements interleaved with other stuff).
    */
    if (intr->instr.block != nir_start_block(b->impl)) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
 
@@ -1058,11 +1058,11 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
       !nir_src_is_const(intr->src[2])) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
 
-   _libagx_write_strip(
+   _poly_write_strip(
       ctx->topology, nir_src_as_uint(intr->src[0]),
       nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
       nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
@@ -1076,7 +1076,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
  * 0, 1, 2, -1, 3, 4, 5, ...
  */
 static bool
-match_list_topology(struct agx_gs_info *info, uint32_t count,
+match_list_topology(struct poly_gs_info *info, uint32_t count,
                     uint32_t *topology, bool has_restart)
 {
    unsigned count_with_restart = count + has_restart;
@@ -1095,7 +1095,7 @@ match_list_topology(struct agx_gs_info *info, uint32_t count,
    }
 
    /* If we match, rewrite the topology and drop indexing */
-   info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
+   info->shape = POLY_GS_SHAPE_STATIC_PER_INSTANCE;
    info->mode = u_decomposed_prim(info->mode);
    info->max_indices =
       ((info->max_indices + has_restart) / count_with_restart) * count;
@@ -1131,12 +1131,12 @@ is_strip_topology(uint32_t *indices, uint32_t index_count)
  * VS(compute) + GS(vertex) sequences without auxiliary programs.
  */
 static void
-optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
+optimize_static_topology(struct poly_gs_info *info, nir_shader *gs)
 {
    struct topology_ctx ctx = {.info = info};
    bool has_restart = info->mode != MESA_PRIM_POINTS;
 
    nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
-   if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
+   if (info->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED)
       return;
 
    /* We can always drop the trailing restart index */
@@ -1150,7 +1150,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
 
    /* Try to pattern match a strip topology */
    if (is_strip_topology(ctx.topology, info->max_indices)) {
-      info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
+      info->shape = POLY_GS_SHAPE_STATIC_PER_PRIM;
       return;
    }
 
@@ -1161,7 +1161,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
    * XXX: check if this holds with streams.
    */
    if (info->max_indices >= ARRAY_SIZE(info->topology)) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return;
    }
 
@@ -1170,12 +1170,12 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
       info->topology[i] = ctx.topology[i];
    }
 
-   info->shape = AGX_GS_SHAPE_STATIC_INDEXED;
+   info->shape = POLY_GS_SHAPE_STATIC_INDEXED;
 }
 
 bool
-agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
-                 nir_shader **pre_gs, struct agx_gs_info *info)
+poly_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
+                  nir_shader **pre_gs, struct poly_gs_info *info)
 {
    /* Lower I/O as assumed by the rest of GS lowering */
    if (gs->xfb_info != NULL) {
@@ -1212,7 +1212,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    /* Lower geometry shader writes to contain all of the required counts, so we
     * know where in the various buffers we should write vertices.
     */
-   NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics);
+   NIR_PASS(_, gs, lower_gs_intrinsics);
 
    /* Clean up after all that lowering we did */
    bool progress = false;
@@ -1241,7 +1241,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    */
    struct lower_gs_state gs_state = {.info = info};
 
-   *info = (struct agx_gs_info){
+   *info = (struct poly_gs_info){
       .mode = gs->info.gs.output_primitive,
       .xfb = gs->xfb_info != NULL,
      .shape = -1,
@@ -1252,10 +1252,13 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
                                         gs_state.static_count, 4);
 
+   STATIC_ASSERT(ARRAY_SIZE(gs_state.count_index) ==
+                 ARRAY_SIZE(gs_state.static_count));
+
    /* Anything we don't know statically will be tracked by the count buffer.
     * Determine the layout for it.
    */
-   for (unsigned i = 0; i < MAX_VERTEX_STREAMS; ++i) {
+   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
       gs_state.count_index[i] = (gs_state.static_count[i] < 0) ?
         info->count_words++ : -1;
    }
 
@@ -1272,7 +1275,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    if (static_indices[0] >= 0) {
       optimize_static_topology(info, gs);
    } else {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
    }
 
    *gs_copy = create_gs_rast_shader(gs, &gs_state);
@@ -1344,20 +1347,22 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    /* Gather information required for transform feedback / query programs */
    struct nir_xfb_info *xfb = gs->xfb_info;
 
-   struct agx_xfb_key key = {
+   struct poly_xfb_key key = {
      .streams = gs->info.gs.active_stream_mask,
      .invocations = gs->info.gs.invocations,
      .vertices_per_prim = nir_verts_in_output_prim(gs),
    };
 
-   for (unsigned i = 0; i < 4; ++i) {
+   STATIC_ASSERT(ARRAY_SIZE(key.buffer_to_stream) == ARRAY_SIZE(key.stride));
+
+   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
      key.count_index[i] = gs_state.count_index[i];
      key.static_count[i] = gs_state.static_count[i];
    }
 
    if (xfb) {
      key.buffers_written = xfb->buffers_written;
 
-      for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned i = 0; i < ARRAY_SIZE(key.buffer_to_stream); ++i) {
        key.buffer_to_stream[i] = xfb->buffer_to_stream[i];
        key.stride[i] = xfb->buffers[i].stride;
      }
@@ -1409,14 +1414,13 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
    if (b->shader->info.stage == MESA_SHADER_VERTEX) {
       buffer = nir_load_vs_output_buffer_poly(b);
-      nr_verts =
-         libagx_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
+      nr_verts = poly_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
    } else {
       assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
 
       /* Instancing is unrolled during tessellation so nr_verts is ignored.
       */
      nr_verts = nir_imm_int(b, 0);
-      buffer = libagx_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
+      buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
    }
 
    if (b->shader->info.stage == MESA_SHADER_VERTEX &&
@@ -1431,7 +1435,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_def *linear_id =
       nir_iadd(b, nir_imul(b, instance_id, nr_verts), primitive_id);
 
-   nir_def *addr = libagx_vertex_output_address(
+   nir_def *addr = poly_vertex_output_address(
       b, buffer, nir_imm_int64(b, b->shader->info.outputs_written), linear_id,
      location);
 
@@ -1444,7 +1448,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_vs_before_gs(struct nir_shader *vs)
+poly_nir_lower_vs_before_gs(struct nir_shader *vs)
 {
    /* Lower vertex stores to memory stores */
    return nir_shader_intrinsics_pass(vs, lower_vs_before_gs,
diff --git a/src/poly/nir/poly_nir_lower_gs.h b/src/poly/nir/poly_nir_lower_gs.h
new file mode 100644
index 00000000000..75727a661aa
--- /dev/null
+++ b/src/poly/nir/poly_nir_lower_gs.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "poly/geometry.h"
+#include "nir.h"
+#include "shader_enums.h"
+
+struct nir_def *poly_load_per_vertex_input(struct nir_builder *b,
+                                           nir_intrinsic_instr *intr,
+                                           struct nir_def *vertex);
+
+nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
+                                 unsigned index_size_B);
+
+bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
+
+bool poly_nir_lower_vs_before_gs(struct nir_shader *vs);
+
+struct poly_gs_info {
+   /* Output primitive mode for geometry shaders */
+   enum mesa_prim mode;
+
+   /* Number of words per primitive in the count buffer */
+   unsigned count_words;
+
+   /* Per-input primitive stride of the output index buffer */
+   unsigned max_indices;
+
+   /* Whether the GS includes transform feedback at a compile-time level */
+   bool xfb;
+
+   /* Whether a prefix sum is required on the count outputs. Implies xfb */
+   bool prefix_sum;
+
+   /* Whether the GS writes to a stream other than stream #0 */
+   bool multistream;
+
+   /* Shape of the rasterization draw, named by the instance ID */
+   enum poly_gs_shape shape;
+
+   /* Static topology used if shape = POLY_GS_SHAPE_STATIC_INDEXED */
+   uint8_t topology[64];
+};
+
+bool poly_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
+                       struct nir_shader **gs_copy, struct nir_shader **pre_gs,
+                       struct poly_gs_info *info);
+
+bool poly_nir_lower_tcs(struct nir_shader *tcs);
+
+bool poly_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
+
+uint64_t poly_tcs_per_vertex_outputs(const struct nir_shader *nir);
+
+unsigned poly_tcs_output_stride(const struct nir_shader *nir);
diff --git a/src/asahi/lib/agx_nir_lower_ia.c b/src/poly/nir/poly_nir_lower_ia.c
similarity index 70%
rename from src/asahi/lib/agx_nir_lower_ia.c
rename to src/poly/nir/poly_nir_lower_ia.c
index f0c0c45ab52..14bf7e704d7 100644
--- a/src/asahi/lib/agx_nir_lower_ia.c
+++ b/src/poly/nir/poly_nir_lower_ia.c
@@ -4,25 +4,30 @@
  */
 
 #include "compiler/nir/nir_builder.h"
-#include "libagx/geometry.h"
-#include "libagx/libagx.h"
-#include "agx_nir_lower_gs.h"
+#include "poly/cl/libpoly.h"
+#include "poly/geometry.h"
 #include "nir.h"
 
+/* XXX: Remove me later */
+nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
+                                 unsigned index_size_B);
+
+bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
+
 /*
  * This file implements basic input assembly in software. It runs on software
  * vertex shaders, as part of geometry/tessellation lowering. It does not apply
  * the topology, which happens in the geometry shader.
  */
 nir_def *
-agx_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
+poly_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
 {
    /* If drawing with an index buffer, pull the vertex ID. Otherwise, the
    * vertex ID is just the index as-is.
    */
    if (index_size_B) {
       nir_def *ia = nir_load_input_assembly_buffer_poly(b);
-      id = libagx_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
+      id = poly_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
    }
 
    /* Add the "start", either an index bias or a base vertex. This must happen
@@ -39,7 +44,8 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 
    if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
       nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
-      nir_def_replace(&intr->def, agx_nir_load_vertex_id(b, id, *index_size_B));
+      nir_def_replace(&intr->def,
+                      poly_nir_load_vertex_id(b, id, *index_size_B));
       return true;
    } else if (intr->intrinsic == nir_intrinsic_load_instance_id) {
       nir_def_replace(&intr->def,
@@ -51,7 +57,7 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
+poly_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
 {
    return nir_shader_intrinsics_pass(s, lower, nir_metadata_control_flow,
                                      &index_size_B);
diff --git a/src/asahi/lib/agx_nir_lower_tess.c b/src/poly/nir/poly_nir_lower_tess.c
similarity index 78%
rename from src/asahi/lib/agx_nir_lower_tess.c
rename to src/poly/nir/poly_nir_lower_tess.c
index d765d10dda1..70aa21eeeb3 100644
--- a/src/asahi/lib/agx_nir_lower_tess.c
+++ b/src/poly/nir/poly_nir_lower_tess.c
@@ -3,11 +3,11 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "libagx/geometry.h"
-#include "libagx/libagx.h"
+#include "poly/cl/libpoly.h"
+#include "poly/geometry.h"
+#include "poly/nir/poly_nir_lower_gs.h"
 #include "util/bitscan.h"
 #include "util/macros.h"
-#include "agx_nir_lower_gs.h"
 #include "nir.h"
 #include "nir_builder.h"
 #include "nir_builder_opcodes.h"
@@ -18,12 +18,12 @@
 static nir_def *
 tcs_unrolled_id(nir_builder *b)
 {
-   return libagx_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
-                                 nir_load_workgroup_id(b));
+   return poly_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
+                               nir_load_workgroup_id(b));
 }
 
 uint64_t
-agx_tcs_per_vertex_outputs(const nir_shader *nir)
+poly_tcs_per_vertex_outputs(const nir_shader *nir)
 {
    return nir->info.outputs_written &
          ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER |
@@ -31,11 +31,11 @@ agx_tcs_per_vertex_outputs(const nir_shader *nir)
 }
 
 unsigned
-agx_tcs_output_stride(const nir_shader *nir)
+poly_tcs_output_stride(const nir_shader *nir)
 {
-   return libagx_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
-                                nir->info.tess.tcs_vertices_out,
-                                agx_tcs_per_vertex_outputs(nir));
+   return poly_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
+                              nir->info.tess.tcs_vertices_out,
+                              poly_tcs_per_vertex_outputs(nir));
 }
 
 static nir_def *
@@ -44,12 +44,12 @@ tcs_out_addr(nir_builder *b, nir_intrinsic_instr *intr, nir_def *vertex_id)
    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
    nir_def *offset = nir_get_io_offset_src(intr)->ssa;
 
-   nir_def *addr = libagx_tcs_out_address(
+   nir_def *addr = poly_tcs_out_address(
       b, nir_load_tess_param_buffer_poly(b), tcs_unrolled_id(b), vertex_id,
       nir_iadd_imm(b, offset, sem.location),
       nir_imm_int(b, util_last_bit(b->shader->info.patch_outputs_written)),
       nir_imm_int(b, b->shader->info.tess.tcs_vertices_out),
-      nir_imm_int64(b, agx_tcs_per_vertex_outputs(b->shader)));
+      nir_imm_int64(b, poly_tcs_per_vertex_outputs(b->shader)));
 
    addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
 
@@ -68,9 +68,9 @@ lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
    if (intr->intrinsic == nir_intrinsic_load_per_vertex_input)
       vertex = intr->src[0].ssa;
 
-   nir_def *addr = libagx_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
-                                         nir_load_vertex_id(b), vertex,
-                                         nir_iadd_imm(b, offset, location));
+   nir_def *addr = poly_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
+                                       nir_load_vertex_id(b), vertex,
+                                       nir_iadd_imm(b, offset, location));
 
    if (nir_intrinsic_has_component(intr))
       addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@@ -84,10 +84,10 @@ tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr)
 {
    nir_def *base = nir_imul(
       b, tcs_unrolled_id(b),
-      libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
+      poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
    nir_def *vertex = nir_iadd(b, base, intr->src[0].ssa);
 
-   return agx_load_per_vertex_input(b, intr, vertex);
+   return poly_load_per_vertex_input(b, intr, vertex);
 }
 
 static nir_def *
@@ -114,16 +114,15 @@ lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
       return tcs_load_input(b, intr);
 
    case nir_intrinsic_load_patch_vertices_in:
-      return libagx_tcs_patch_vertices_in(b,
-                                          nir_load_tess_param_buffer_poly(b));
+      return poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_tess_level_outer_default:
-      return libagx_tess_level_outer_default(
-         b, nir_load_tess_param_buffer_poly(b));
+      return poly_tess_level_outer_default(b,
+                                           nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_tess_level_inner_default:
-      return libagx_tess_level_inner_default(
-         b, nir_load_tess_param_buffer_poly(b));
+      return poly_tess_level_inner_default(b,
+                                           nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_output: {
       nir_def *addr = tcs_out_addr(b, intr, nir_undef(b, 1, 32));
@@ -176,7 +175,7 @@ lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_tcs(nir_shader *tcs)
+poly_nir_lower_tcs(nir_shader *tcs)
 {
    return nir_shader_intrinsics_pass(tcs, lower_tcs, nir_metadata_control_flow,
                                      NULL);
@@ -187,12 +186,12 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    switch (intr->intrinsic) {
    case nir_intrinsic_load_tess_coord_xy:
-      return libagx_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
-                                    nir_load_vertex_id(b));
+      return poly_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
+                                  nir_load_vertex_id(b));
 
    case nir_intrinsic_load_primitive_id:
-      return libagx_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
-                                 nir_load_vertex_id(b));
+      return poly_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
+                               nir_load_vertex_id(b));
 
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_per_vertex_input:
@@ -201,8 +200,7 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
       return lower_tes_load(b, intr);
 
    case nir_intrinsic_load_patch_vertices_in:
-      return libagx_tes_patch_vertices_in(b,
-                                          nir_load_tess_param_buffer_poly(b));
+      return poly_tes_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
 
    default:
       return NULL;
@@ -232,12 +230,12 @@ lower_tes_indexing(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    b->cursor = nir_before_instr(&intr->instr);
    nir_def *p = nir_load_tess_param_buffer_poly(b);
    nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
-   nir_def_replace(&intr->def, libagx_load_tes_index(b, p, id));
+   nir_def_replace(&intr->def, poly_load_tes_index(b, p, id));
    return true;
 }
 
 bool
-agx_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
+poly_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
 {
    nir_lower_tess_coord_z(
       tes, tes->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
diff --git a/src/poly/tessellator.h b/src/poly/tessellator.h
new file mode 100644
index 00000000000..f8b722bfef3
--- /dev/null
+++ b/src/poly/tessellator.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "compiler/libcl/libcl.h"
+
+enum poly_tess_partitioning {
+   POLY_TESS_PARTITIONING_FRACTIONAL_ODD,
+   POLY_TESS_PARTITIONING_FRACTIONAL_EVEN,
+   POLY_TESS_PARTITIONING_INTEGER,
+};
+
+enum poly_tess_mode {
+   /* Do not actually tessellate, just write the index counts */
+   POLY_TESS_MODE_COUNT,
+
+   /* Tessellate using the count buffers to allocate indices */
+   POLY_TESS_MODE_WITH_COUNTS,
+};
+
+struct poly_tess_point {
+   uint32_t u;
+   uint32_t v;
+};
+static_assert(sizeof(struct poly_tess_point) == 8);
+
+struct poly_tess_args {
+   /* Heap to allocate tessellator outputs in */
+   DEVICE(struct poly_heap) heap;
+
+   /* Patch coordinate buffer, indexed as:
+    *
+    *    coord_allocs[patch_ID] + vertex_in_patch
+    */
+   DEVICE(struct poly_tess_point) patch_coord_buffer;
+
+   /* Per-patch index within the heap for the tess coords, written by the
+    * tessellator based on the allocated memory.
+    */
+   DEVICE(uint32_t) coord_allocs;
+
+   /* Space for output draws from the tessellator. API draw calls. */
+   DEVICE(uint32_t) out_draws;
+
+   /* Tessellation control shader output buffer. */
+   DEVICE(float) tcs_buffer;
+
+   /* Count buffer. # of indices per patch written here, then prefix summed. */
+   DEVICE(uint32_t) counts;
+
+   /* Allocated index buffer for all patches, if we're prefix summing counts */
+   DEVICE(uint32_t) index_buffer;
+
+   /* Address of the tess eval invocation counter for implementing pipeline
+    * statistics, if active. Zero if inactive. Incremented by tessellator.
+    */
+   DEVICE(uint32_t) statistic;
+
+   /* When geom+tess used together, the buffer containing TES outputs (executed
+    * as a hardware compute shader).
+    */
+   uint64_t tes_buffer;
+
+   /* Bitfield of TCS per-vertex outputs */
+   uint64_t tcs_per_vertex_outputs;
+
+   /* Default tess levels used in OpenGL when there is no TCS in the pipeline.
+    * Unused in Vulkan and OpenGL ES.
+    */
+   float tess_level_outer_default[4];
+   float tess_level_inner_default[2];
+
+   /* Number of vertices in the input patch */
+   uint32_t input_patch_size;
+
+   /* Number of vertices in the TCS output patch */
+   uint32_t output_patch_size;
+
+   /* Number of patch constants written by TCS */
+   uint32_t tcs_patch_constants;
+
+   /* Number of input patches per instance of the VS/TCS */
+   uint32_t patches_per_instance;
+
+   /* Stride between tessellation factors in the TCS output buffer. */
+   uint32_t tcs_stride_el;
+
+   /* Number of patches being tessellated */
+   uint32_t nr_patches;
+
+   /* Partitioning and points mode. These affect per-patch setup code but not
+    * the hot tessellation loop so we make them dynamic to reduce tessellator
+    * variants.
+    */
+   enum poly_tess_partitioning partitioning;
+   uint32_t points_mode;
+   uint32_t isolines;
+
+   /* When fed into a geometry shader, triangles should be counter-clockwise.
+    * The tessellator always produces clockwise triangles, but we can swap
+    * dynamically in the TES.
+    */
+   uint32_t ccw;
+} PACKED;
+static_assert(sizeof(struct poly_tess_args) == 36 * 4);
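
A quick sanity check on the closing static_assert: assuming 8-byte DEVICE pointers and PACKED eliminating padding, struct poly_tess_args is eight device pointers (64 bytes), two uint64_t fields (16), six floats (24), and ten 32-bit fields counting the enum as 4 bytes (40), for a total of 144 bytes = 36 * 4, matching the assertion.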
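For context on the count buffer layout in poly_nir_lower_gs() above: a stream whose vertex/primitive count is known at compile time needs no runtime counter, so only streams with static_count < 0 are assigned a word in the count buffer, and count_words ends up as the per-primitive stride. A minimal standalone sketch of that slot assignment (plain C; MAX_STREAMS stands in for POLY_MAX_VERTEX_STREAMS and the values are made up, not taken from this patch):

#include <stdio.h>

#define MAX_STREAMS 4

int
main(void)
{
   /* static_count[i] < 0 means the count for stream i is only known at
    * runtime, mirroring gs_state.static_count in the pass.
    */
   int static_count[MAX_STREAMS] = {-1, 6, -1, 0};
   int count_index[MAX_STREAMS];
   int count_words = 0;

   for (unsigned i = 0; i < MAX_STREAMS; ++i)
      count_index[i] = (static_count[i] < 0) ? count_words++ : -1;

   for (unsigned i = 0; i < MAX_STREAMS; ++i)
      printf("stream %u: static=%d, count slot=%d\n", i, static_count[i],
             count_index[i]);

   printf("count_words = %d\n", count_words);
   return 0;
}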
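Similarly, match_list_topology() above recognizes the repeating pattern 0, 1, 2, -1, 3, 4, 5, ... so the restart indices can be dropped and the draw converted to a plain list. A standalone illustration of that pattern check (a sketch of the idea rather than the pass's exact code; it assumes 0xffffffff as the restart index):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
is_list_topology(const uint32_t *ix, uint32_t len, uint32_t count,
                 bool has_restart)
{
   /* Each block is `count` consecutive indices plus an optional restart */
   uint32_t block = count + has_restart;

   for (uint32_t i = 0; i < len; ++i) {
      bool restart = has_restart && (i % block) == count;
      uint32_t expected = restart ? ~0u : (i / block) * count + (i % block);

      if (ix[i] != expected)
         return false;
   }

   return true;
}

int
main(void)
{
   /* Two triangles separated by a restart: matches for count = 3 */
   const uint32_t ix[] = {0, 1, 2, ~0u, 3, 4, 5};
   printf("%d\n", is_list_topology(ix, 7, 3, true));
   return 0;
}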
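Finally, on the software input assembly in poly_nir_lower_ia.c: poly_nir_load_vertex_id() emits NIR that fetches an index_size_B-byte index from the index buffer (for non-indexed draws the invocation index is used directly), then adds the base vertex / index bias on top. The CPU-side equivalent of just the fetch, for illustration only (the real code builds NIR and goes through the input assembly state buffer):

#include <stdint.h>
#include <stdio.h>

static uint32_t
fetch_index(const void *ib, uint32_t id, unsigned index_size_B)
{
   switch (index_size_B) {
   case 1: return ((const uint8_t *)ib)[id];
   case 2: return ((const uint16_t *)ib)[id];
   case 4: return ((const uint32_t *)ib)[id];
   default: return id; /* non-indexed draw: the ID is the vertex ID */
   }
}

int
main(void)
{
   const uint16_t ib[] = {7, 8, 9};
   printf("%u\n", fetch_index(ib, 1, 2)); /* prints 8 */
   return 0;
}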