poly: Migrate AGX's GS/TESS emulation to common code

This moves most of the code to a new home: src/poly.
Most of the precompiled kernel logic that could be moved is now provided by poly.
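
For example, the shared GS/tessellation helpers keep their logic and only
change prefix as they move, as seen throughout the diff below (illustrative,
not exhaustive):

    struct agx_heap              -> struct poly_heap
    struct agx_ia_state          -> struct poly_ia_state
    struct agx_geometry_params   -> struct poly_geometry_params
    enum agx_gs_shape            -> enum poly_gs_shape
    libagx_index_buffer()        -> poly_index_buffer()
    agx_heap_alloc_nonatomic()   -> poly_heap_alloc_nonatomic()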

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37914>
Mary Guillemard 2025-10-06 12:57:04 +02:00 committed by Marge Bot
parent 8048004238
commit b2accf86d1
35 changed files with 3421 additions and 3117 deletions


@ -845,6 +845,10 @@ endif
with_llvm = with_llvm \
.enable_if(with_clc, error_message : 'CLC requires LLVM')
with_poly = [
with_gallium_asahi, with_asahi_vk, with_tools.contains('asahi'),
].contains(true)
dep_clc = null_dep
if with_clc
dep_clc = dependency('libclc')


@ -237,7 +237,9 @@ ForEachMacros:
- agx_foreach_reg_dest
- agx_foreach_successor
- foreach_next_use
- libagx_foreach_xfb
# poly
- poly_foreach_xfb
# radv
- PHASE


@ -316,16 +316,6 @@ agx_fill_decompress_args(struct ail_layout *layout, unsigned layer,
agx_fill_decompress_args(layout, layer, level, ptr, images), \
util_logbase2(layout->sample_count_sa))
#define libagx_tessellate(context, grid, barrier, prim, mode, state) \
if (prim == TESS_PRIMITIVE_QUADS) { \
libagx_tess_quad(context, grid, barrier, state, mode); \
} else if (prim == TESS_PRIMITIVE_TRIANGLES) { \
libagx_tess_tri(context, grid, barrier, state, mode); \
} else { \
assert(prim == TESS_PRIMITIVE_ISOLINES); \
libagx_tess_isoline(context, grid, barrier, state, mode); \
}
struct agx_border_packed;
void agx_pack_border(struct agx_border_packed *out, const uint32_t in[4],


@ -1,61 +0,0 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include "libagx/geometry.h"
#include "nir.h"
#include "shader_enums.h"
struct nir_def *agx_load_per_vertex_input(struct nir_builder *b,
nir_intrinsic_instr *intr,
struct nir_def *vertex);
nir_def *agx_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool agx_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
bool agx_nir_lower_vs_before_gs(struct nir_shader *vs);
struct agx_gs_info {
/* Output primitive mode for geometry shaders */
enum mesa_prim mode;
/* Number of words per primitive in the count buffer */
unsigned count_words;
/* Per-input primitive stride of the output index buffer */
unsigned max_indices;
/* Whether the GS includes transform feedback at a compile-time level */
bool xfb;
/* Whether a prefix sum is required on the count outputs. Implies xfb */
bool prefix_sum;
/* Whether the GS writes to a stream other than stream #0 */
bool multistream;
/* Shape of the rasterization draw, named by the instance ID */
enum agx_gs_shape shape;
/* Static topology used if shape = AGX_GS_SHAPE_STATIC_INDEXED */
uint8_t topology[64];
};
bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
struct nir_shader **gs_copy, struct nir_shader **pre_gs,
struct agx_gs_info *info);
bool agx_nir_lower_tcs(struct nir_shader *tcs);
bool agx_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir);
unsigned agx_tcs_output_stride(const struct nir_shader *nir);


@ -5,11 +5,12 @@
*/
#include "gallium/include/pipe/p_defines.h"
#include "poly/cl/libpoly.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/format/u_formats.h"
#include "agx_abi.h"
#include "agx_linker.h"
#include "agx_nir.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
@ -149,11 +150,11 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *id = nir_load_vertex_id(b);
if (key->adjacency == MESA_PRIM_LINES_ADJACENCY) {
id = libagx_map_to_line_adj(b, id);
id = poly_map_to_line_adj(b, id);
} else if (key->adjacency == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
id = libagx_map_to_tri_strip_adj(b, id);
id = poly_map_to_tri_strip_adj(b, id);
} else if (key->adjacency == MESA_PRIM_LINE_STRIP_ADJACENCY) {
id = libagx_map_to_line_strip_adj(b, id);
id = poly_map_to_line_strip_adj(b, id);
} else if (key->adjacency == MESA_PRIM_TRIANGLES_ADJACENCY) {
/* Sequence (0, 2, 4), (6, 8, 10), ... */
id = nir_imul_imm(b, id, 2);
@ -161,7 +162,7 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data)
UNREACHABLE("unknown");
}
id = agx_nir_load_vertex_id(b, id, key->sw_index_size_B);
id = poly_nir_load_vertex_id(b, id, key->sw_index_size_B);
nir_def_replace(&intr->def, id);
return true;
@ -215,7 +216,7 @@ agx_nir_vs_prolog(nir_builder *b, const void *key_)
}
if (!key->hw) {
agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
poly_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
} else if (key->adjacency) {
nir_shader_intrinsics_pass(b->shader, lower_adjacency,
nir_metadata_control_flow, (void *)key);


@ -11,11 +11,8 @@ libasahi_lib_files = files(
'agx_linker.c',
'agx_bg_eot.c',
'agx_tilebuffer.c',
'agx_nir_lower_gs.c',
'agx_nir_lower_ia.c',
'agx_nir_lower_msaa.c',
'agx_nir_lower_sample_intrinsics.c',
'agx_nir_lower_tess.c',
'agx_nir_lower_tilebuffer.c',
'agx_nir_lower_uvs.c',
'agx_nir_lower_vbo.c',
@ -66,8 +63,8 @@ libasahi_lib = static_library(
include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm],
c_args : [no_override_init_args, '-Wno-c2x-extensions'],
gnu_symbol_visibility : 'hidden',
link_with: [libasahi_decode, libvdrm],
dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx],
link_with: [libasahi_decode, libvdrm, libpoly_nir],
dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx, idep_libpoly],
build_by_default : false,
)


@ -4,8 +4,8 @@
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "agx_pack.h"
#include "geometry.h"
#include "libagx_dgc.h"
/*
@ -36,7 +36,7 @@ libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in,
KERNEL(1)
libagx_draw_without_adj(global VkDrawIndirectCommand *out,
global VkDrawIndirectCommand *in,
global struct agx_ia_state *ia, uint64_t index_buffer,
global struct poly_ia_state *ia, uint64_t index_buffer,
uint64_t index_buffer_range_el, int index_size_B,
enum mesa_prim prim)
{
@ -49,11 +49,11 @@ libagx_draw_without_adj(global VkDrawIndirectCommand *out,
if (index_size_B) {
uint offs = in->firstVertex;
ia->index_buffer = libagx_index_buffer(
index_buffer, index_buffer_range_el, offs, index_size_B);
ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
offs, index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(index_buffer_range_el, offs);
poly_index_buffer_range_el(index_buffer_range_el, offs);
}
}
@ -122,8 +122,7 @@ libagx_memset_small(global uchar *dst, uchar b, int len, uint tid)
* TODO: Handle multiple draws in parallel.
*/
KERNEL(32)
libagx_draw_robust_index(global uint32_t *vdm,
global struct agx_heap *heap,
libagx_draw_robust_index(global uint32_t *vdm, global struct poly_heap *heap,
constant VkDrawIndexedIndirectCommand *cmd,
uint64_t in_buf_ptr, uint32_t in_buf_range_B,
ushort restart, enum agx_primitive topology,
@ -163,7 +162,7 @@ libagx_draw_robust_index(global uint32_t *vdm,
/* Allocate memory for the shadow index buffer */
global uchar *padded;
if (first) {
padded = agx_heap_alloc_nonatomic(heap, out_size_B);
padded = poly_heap_alloc_nonatomic(heap, out_size_B);
}
padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0);
@ -172,7 +171,7 @@ libagx_draw_robust_index(global uint32_t *vdm,
draw.start = 0;
/* Clone the index buffer. The destination is aligned as a post-condition
* of agx_heap_alloc_nonatomic.
* of poly_heap_alloc_nonatomic.
*/
libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid,
32);


@ -4,15 +4,11 @@
* SPDX-License-Identifier: MIT
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "geometry.h"
#include "query.h"
#include "tessellator.h"
uint64_t nir_ro_to_rw_poly(uint64_t address);
/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
* ID list with a consistent winding order.
@ -32,54 +28,6 @@ map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
return (provoking || even) ? vert : ((3 - pv) - vert);
}
static inline uint
xfb_prim(uint id, uint n, uint copy)
{
return sub_sat(id, n - 1u) + copy;
}
/*
* Determine whether an output vertex has an n'th copy in the transform feedback
* buffer. This is written weirdly to let constant folding remove unnecessary
* stores when length is known statically.
*/
bool
libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
{
uint prim = xfb_prim(id, n, copy);
int num_prims = length - (n - 1);
return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
}
uint
libagx_xfb_vertex_offset(uint n, uint invocation_base_prim,
uint strip_base_prim, uint id_in_strip, uint copy,
bool flatshade_first)
{
uint prim = xfb_prim(id_in_strip, n, copy);
uint vert_0 = min(id_in_strip, n - 1);
uint vert = vert_0 - copy;
if (n == 3) {
vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
/* Tally up in the whole buffer */
uint base_prim = invocation_base_prim + strip_base_prim;
uint base_vertex = base_prim * n;
return base_vertex + (prim * n) + vert;
}
uint64_t
libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index,
uint buffer, uint stride, uint output_offset)
{
uint xfb_offset = (index * stride) + output_offset;
return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
}
static uint
vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
@ -90,20 +38,6 @@ vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
return prim + vert;
}
uint
libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
uint num_prims)
{
/* Line list, line strip, or line loop */
if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
return 0;
if (mode == MESA_PRIM_LINES)
prim *= 2;
return prim + vert;
}
static uint
vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
@ -122,44 +56,6 @@ vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
return (vert == 0) ? 0 : prim + vert;
}
uint
libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
bool flatshade_first)
{
if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
vert = vert + 1;
vert = (vert == 3) ? 0 : vert;
}
if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
return 0;
if (mode == MESA_PRIM_TRIANGLES)
prim *= 3;
/* Triangle list, triangle strip, or triangle fan */
if (mode == MESA_PRIM_TRIANGLE_STRIP) {
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
vert = ((provoking || even) ? vert : ((3 - pv) - vert));
}
return prim + vert;
}
uint
libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
{
/* Line list adj or line strip adj */
if (mode == MESA_PRIM_LINES_ADJACENCY)
prim *= 4;
return prim + vert;
}
static uint
vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
bool flatshade_first)
@ -206,18 +102,6 @@ vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
return (prim * 2) + offset;
}
uint
libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
uint nr, bool flatshade_first)
{
/* Tri adj list or tri adj strip */
if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
} else {
return (6 * prim) + vert;
}
}
static uint
vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
uint vert, uint num_prims)
@ -262,127 +146,6 @@ vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
}
}
uint
libagx_map_to_line_adj(uint id)
{
/* Sequence (1, 2), (5, 6), (9, 10), ... */
return ((id & ~1) * 2) + (id & 1) + 1;
}
uint
libagx_map_to_line_strip_adj(uint id)
{
/* Sequence (1, 2), (2, 3), (4, 5), .. */
uint prim = id / 2;
uint vert = id & 1;
return prim + vert + 1;
}
uint
libagx_map_to_tri_strip_adj(uint id)
{
/* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
*
* Although tri strips with adjacency have 6 cases in general, after
* disregarding the vertices only available in a geometry shader, there are
* only even/odd cases. In other words, it's just a triangle strip subject to
* extra padding.
*
* Dividing through by two, the sequence is:
*
* (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
*/
uint prim = id / 3;
uint vtx = id % 3;
/* Flip the winding order of odd triangles */
if ((prim % 2) == 1) {
if (vtx == 1)
vtx = 2;
else if (vtx == 2)
vtx = 1;
}
return 2 * (prim + vtx);
}
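
As a sanity check of the sequence documented above, here is a minimal
standalone C sketch (illustration only; it simply reimplements the mapping and
prints the first four triangles):

#include <stdio.h>

/* Host-side mirror of libagx_map_to_tri_strip_adj, for illustration. */
static unsigned
map_to_tri_strip_adj(unsigned id)
{
   unsigned prim = id / 3;
   unsigned vtx = id % 3;

   /* Flip the winding order of odd triangles */
   if ((prim % 2) == 1) {
      if (vtx == 1)
         vtx = 2;
      else if (vtx == 2)
         vtx = 1;
   }

   return 2 * (prim + vtx);
}

int
main(void)
{
   /* Prints: (0, 2, 4) (2, 6, 4) (4, 6, 8) (6, 10, 8) */
   for (unsigned id = 0; id < 12; id += 3) {
      printf("(%u, %u, %u) ", map_to_tri_strip_adj(id),
             map_to_tri_strip_adj(id + 1), map_to_tri_strip_adj(id + 2));
   }
   printf("\n");
   return 0;
}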
static void
store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
global uint32_t *out_32 = (global uint32_t *)index_buffer;
global uint16_t *out_16 = (global uint16_t *)index_buffer;
global uint8_t *out_8 = (global uint8_t *)index_buffer;
if (index_size_B == 4)
out_32[id] = value;
else if (index_size_B == 2)
out_16[id] = value;
else
out_8[id] = value;
}
static uint
load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
/* If the load would be out-of-bounds, load the first element which is
* assumed valid. If the application index buffer is empty with robustness2,
* index_buffer will point to a zero sink where only the first is valid.
*/
if (oob) {
id = 0;
}
uint el;
if (index_size == 1) {
el = ((constant uint8_t *)index_buffer)[id];
} else if (index_size == 2) {
el = ((constant uint16_t *)index_buffer)[id];
} else {
el = ((constant uint32_t *)index_buffer)[id];
}
/* D3D robustness semantics. TODO: Optimize? */
if (oob) {
el = 0;
}
return el;
}
uint
libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
uint index_size)
{
return load_index(p->index_buffer, p->index_buffer_range_el, id, index_size);
}
static void
increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c,
uint count)
{
global uint32_t *ptr[] = {a, b, c};
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
static unsigned
decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
unsigned verts_per_patch)
{
if (prim >= MESA_PRIM_PATCHES) {
return vertices / verts_per_patch;
} else {
return u_decomposed_prims_for_vertices(prim, vertices);
}
}
KERNEL(1)
libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *ia_primitives,
@ -390,13 +153,8 @@ libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim, unsigned verts_per_patch)
{
increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]);
uint prims =
decomposed_prims_for_vertices_with_tess(prim, draw[0], verts_per_patch) *
draw[1];
increment_counters(ia_primitives, c_prims, c_invs, prims);
poly_increment_ia(ia_vertices, ia_primitives, vs_invocations, c_prims,
c_invs, draw, prim, verts_per_patch);
}
KERNEL(1024)
@ -418,8 +176,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Count non-restart indices */
for (uint i = tid; i < count; i += 1024) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
uint index = poly_load_index(index_buffer, index_buffer_range_el,
start + i, index_size_B);
if (index != restart_index)
partial++;
@ -433,7 +191,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Elect a single thread from the workgroup to increment the counters */
if (tid == 0) {
increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]);
poly_increment_counters(ia_vertices, vs_invocations, NULL,
scratch * draw[1]);
}
/* TODO: We should vectorize this */
@ -441,22 +200,22 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
uint accum = 0;
int last_restart = -1;
for (uint i = 0; i < count; ++i) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
uint index = poly_load_index(index_buffer, index_buffer_range_el,
start + i, index_size_B);
if (index == restart_index) {
accum += decomposed_prims_for_vertices_with_tess(
accum += poly_decomposed_prims_for_vertices_with_tess(
prim, i - last_restart - 1, verts_per_patch);
last_restart = i;
}
}
{
accum += decomposed_prims_for_vertices_with_tess(
accum += poly_decomposed_prims_for_vertices_with_tess(
prim, count - last_restart - 1, verts_per_patch);
}
increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
poly_increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
}
}
@ -483,7 +242,7 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch)
* sets up most of the new draw descriptor.
*/
static global void *
setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw,
global uint *out, enum mesa_prim mode, uint index_size_B)
{
/* Determine an upper bound on the memory required for the index buffer.
@ -499,7 +258,7 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
* TODO: For multidraw, should be atomic. But multidraw+unroll isn't
* currently wired up in any driver.
*/
uint old_heap_bottom_B = agx_heap_alloc_nonatomic_offs(heap, alloc_size);
uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
/* Setup most of the descriptor. Count will be determined after unroll. */
out[1] = in_draw[1]; /* instance count */
@ -512,14 +271,14 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
}
KERNEL(1024)
libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
constant uint *in_draw, global uint32_t *out_draw,
uint32_t max_draws, uint32_t restart_index,
uint32_t index_buffer_size_el, uint32_t index_size_log2,
uint32_t flatshade_first, uint mode__11)
{
uint32_t index_size_B = 1 << index_size_log2;
enum mesa_prim mode = libagx_uncompact_prim(mode__11);
enum mesa_prim mode = poly_uncompact_prim(mode__11);
uint tid = cl_local_id.x;
uint count = in_draw[0];
@ -531,7 +290,7 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
barrier(CLK_LOCAL_MEM_FENCE);
uintptr_t in_ptr = (uintptr_t)(libagx_index_buffer(
uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
index_buffer, index_buffer_size_el, in_draw[2], index_size_B));
local uint scratch[32];
@ -545,8 +304,8 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
for (;;) {
uint idx = next_restart + tid;
bool restart =
idx >= count || load_index(in_ptr, index_buffer_size_el, idx,
index_size_B) == restart_index;
idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx,
index_size_B) == restart_index;
uint next_offs = first_true_thread_in_workgroup(restart, scratch);
@ -566,10 +325,10 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
uint offset = needle + id;
uint x = ((out_prims_base + i) * per_prim) + vtx;
uint y =
load_index(in_ptr, index_buffer_size_el, offset, index_size_B);
uint y = poly_load_index(in_ptr, index_buffer_size_el, offset,
index_size_B);
store_index(out_ptr, index_size_B, x, y);
poly_store_index(out_ptr, index_size_B, x, y);
}
}
@ -581,216 +340,39 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
out_draw[0] = out_prims * per_prim;
}
static uint
setup_xfb_buffer(global struct agx_geometry_params *p, uint i, uint stride,
uint max_output_end, uint vertices_per_prim)
{
uint xfb_offset = *(p->xfb_offs_ptrs[i]);
p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
/* Let output_end = output_offset + output_size.
*
* Primitive P will write up to (but not including) offset:
*
* xfb_offset + ((P - 1) * (verts_per_prim * stride))
* + ((verts_per_prim - 1) * stride)
* + output_end
*
* To fit all outputs for P, that value must be less than the XFB
* buffer size for the output with maximal output_end, as everything
* else is constant here across outputs within a buffer/primitive:
*
* floor(P) <= (stride + size - xfb_offset - output_end)
* // (stride * verts_per_prim)
*/
int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
uint numer = max(numer_s, 0);
return numer / (stride * vertices_per_prim);
}
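
A hedged numeric check of the bound above (made-up values, comment only):

/* Example: stride = 16 B (one vec4 per vertex), vertices_per_prim = 3,
 * max_output_end = 16, xfb_offset = 0, xfb_size = 160:
 *
 *    numer          = 160 + (16 - 16) - 0 = 160
 *    max primitives = 160 / (16 * 3)      = 3
 *
 * Three triangles write 3 * 3 * 16 = 144 bytes, which fits in 160 bytes;
 * a fourth would run up to 192 bytes and is correctly excluded.
 */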
void
libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
uint32_t prim_index_offset, uint32_t vertex_offset,
uint32_t verts_in_prim, uint3 info)
{
_libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset,
vertex_offset, verts_in_prim, info.x, info.y, info.z);
}
void
libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset,
uint nr_indices, uint alloc)
{
for (uint i = nr_indices; i < alloc; ++i) {
index_buffer[inv_index_offset + i] = -1;
}
}
KERNEL(1)
libagx_gs_setup_indirect(
uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct agx_ia_state *ia /* output */,
global struct agx_geometry_params *p /* output */,
global struct agx_heap *heap,
global struct poly_ia_state *ia /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index buffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices, enum agx_gs_shape shape)
int is_prefix_summing, uint max_indices, enum poly_gs_shape shape)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
uint instance_count = draw[1];
ia->verts_per_instance = vertex_count;
/* Calculate number of primitives input into the GS */
uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count);
p->input_primitives = prim_per_instance * instance_count;
/* Invoke VS as (vertices, instances); GS as (primitives, instances) */
p->vs_grid[0] = vertex_count;
p->vs_grid[1] = instance_count;
p->gs_grid[0] = prim_per_instance;
p->gs_grid[1] = instance_count;
p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*/
if (index_size_B) {
ia->index_buffer = libagx_index_buffer(
index_buffer, index_buffer_range_el, draw[2], index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(index_buffer_range_el, draw[2]);
}
/* We need to allocate VS and GS count buffers, do so now */
uint vertex_buffer_size =
libagx_tcs_in_size(vertex_count * instance_count, vs_outputs);
if (is_prefix_summing) {
p->count_buffer = agx_heap_alloc_nonatomic(
heap, p->input_primitives * p->count_buffer_stride);
}
p->input_buffer =
(uintptr_t)agx_heap_alloc_nonatomic(heap, vertex_buffer_size);
*vertex_buffer = p->input_buffer;
p->input_mask = vs_outputs;
/* Allocate the index buffer and write the draw consuming it */
global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc;
*cmd = (VkDrawIndexedIndirectCommand){
.indexCount = agx_gs_rast_vertices(shape, max_indices, prim_per_instance,
instance_count),
.instanceCount = agx_gs_rast_instances(shape, max_indices,
prim_per_instance, instance_count),
};
if (shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
cmd->firstIndex =
agx_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4;
p->output_index_buffer =
(global uint *)(heap->base + (cmd->firstIndex * 4));
}
}
/*
* Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
* manually with subgroup ops and local memory since Mesa doesn't do those
* lowerings yet.
*/
static uint2
libagx_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
uint sg_id = get_sub_group_id();
/* Partial prefix sum of the subgroup */
uint sg = sub_group_scan_inclusive_add(x);
/* Reduction (sum) for the subgroup */
uint sg_sum = sub_group_broadcast(sg, 31);
/* Write out all the subgroups sums */
barrier(CLK_LOCAL_MEM_FENCE);
scratch[sg_id] = sg_sum;
barrier(CLK_LOCAL_MEM_FENCE);
/* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
* threads in subgroup T.
*/
uint other_sum = scratch[get_sub_group_local_id()];
/* Exclusive sum the subgroup sums to get the total before the current group,
* which can be added to the total for the current group.
*/
uint other_sums = sub_group_scan_exclusive_add(other_sum);
uint base = sub_group_broadcast(other_sums, sg_id);
uint prefix = base + sg;
/* Reduce the workgroup using the prefix sum we already did */
uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
return (uint2)(prefix, reduction);
}
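
For context, the sizes here line up as follows (an inference from the
surrounding code, not stated in the change):

/* The callers run 1024-thread workgroups with 32-wide subgroups (hence the
 * broadcasts from lane 31), so at most 32 subgroup sums need to be staged,
 * matching the callers' local uint scratch[32].
 */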
static void
_libagx_prefix_sum(local uint *scratch, global uint *buffer, uint len,
uint words, uint word)
{
uint tid = cl_local_id.x;
/* Main loop: complete workgroups processing 1024 values at once */
uint i, count = 0;
uint len_remainder = len % 1024;
uint len_rounded_down = len - len_remainder;
for (i = tid; i < len_rounded_down; i += 1024) {
global uint *ptr = &buffer[(i * words) + word];
uint value = *ptr;
uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch);
*ptr = count + sums[0];
count += sums[1];
}
/* The last iteration is special since we won't have a full subgroup unless
* the length is divisible by the subgroup size, and we don't advance count.
*/
global uint *ptr = &buffer[(i * words) + word];
uint value = (tid < len_remainder) ? *ptr : 0;
uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0];
if (tid < len_remainder) {
*ptr = count + scan;
}
poly_gs_setup_indirect(index_buffer, draw, vertex_buffer, ia, p, heap,
vs_outputs, index_size_B, index_buffer_range_el, prim,
is_prefix_summing, max_indices, shape);
}
KERNEL(1024)
libagx_prefix_sum_geom(constant struct agx_geometry_params *p)
libagx_prefix_sum_geom(constant struct poly_geometry_params *p)
{
local uint scratch[32];
_libagx_prefix_sum(scratch, p->count_buffer, p->input_primitives,
p->count_buffer_stride / 4, cl_group_id.x);
poly_prefix_sum(scratch, p->count_buffer, p->input_primitives,
p->count_buffer_stride / 4, cl_group_id.x, 1024);
}
KERNEL(1024)
libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
libagx_prefix_sum_tess(global struct poly_tess_args *p, global uint *c_prims,
global uint *c_invs, uint increment_stats__2)
{
local uint scratch[32];
_libagx_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */,
0 /* word */);
poly_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */,
0 /* word */, 1024);
/* After prefix summing, we know the total # of indices, so allocate the
* index buffer now. Elect a thread for the allocation.
@ -805,7 +387,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
/* Allocate 4-byte indices */
uint32_t elsize_B = sizeof(uint32_t);
uint32_t size_B = total * elsize_B;
uint alloc_B = agx_heap_alloc_nonatomic_offs(p->heap, size_B);
uint alloc_B = poly_heap_alloc_nonatomic_offs(p->heap, size_B);
p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->base) + alloc_B);
/* ...and now we can generate the API indexed draw */
@ -818,7 +400,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
desc[4] = 0; /* start_instance */
/* If necessary, increment clipper statistics too. This is only used when
* there's no geometry shader following us. See agx_nir_lower_gs.c for more
* there's no geometry shader following us. See poly_nir_lower_gs.c for more
* info on the emulation. We just need to calculate the # of primitives
* tessellated.
*/
@ -827,150 +409,6 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
: p->isolines ? (total / 2)
: (total / 3);
increment_counters(c_prims, c_invs, NULL, prims);
poly_increment_counters(c_prims, c_invs, NULL, prims);
}
}
uintptr_t
libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
gl_varying_slot location)
{
/* Written like this to let address arithmetic work */
return buffer + ((uintptr_t)libagx_tcs_in_offs_el(vtx, location, mask)) * 16;
}
uintptr_t
libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx,
gl_varying_slot location)
{
return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx,
location);
}
unsigned
libagx_input_vertices(constant struct agx_ia_state *ia)
{
return ia->verts_per_instance;
}
global uint *
libagx_load_xfb_count_address(constant struct agx_geometry_params *p, int index,
int count_words, uint unrolled_id)
{
return &p->count_buffer[(unrolled_id * count_words) + index];
}
uint
libagx_previous_xfb_primitives(global struct agx_geometry_params *p,
int static_count, int count_index,
int count_words, bool prefix_sum,
uint unrolled_id)
{
if (static_count >= 0) {
/* If the number of outputted vertices per invocation is known statically,
* we can calculate the base.
*/
return unrolled_id * static_count;
} else {
/* Otherwise, load from the count buffer. Note that the sums are
* inclusive, so index 0 is nonzero. This requires a little fixup here. We
* use a saturating unsigned subtraction so we don't read out-of-bounds.
*
* If we didn't prefix sum, there's only one element.
*/
uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
return unrolled_id == 0 ? 0 : count;
}
}
/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
#define libagx_foreach_xfb(word, index) \
for (uint i = 0; i < 4; ++i) \
if (word & BITFIELD_BIT(i))
void
libagx_pre_gs(global struct agx_geometry_params *p, uint streams,
uint buffers_written, uint4 buffer_to_stream, int4 count_index,
uint4 stride, uint4 output_end, int4 static_count,
uint invocations, uint vertices_per_prim,
global uint *gs_invocations, global uint *gs_primitives,
global uint *c_primitives, global uint *c_invocations)
{
unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
!!(count_index[2] >= 0) + !!(count_index[3] >= 0);
bool prefix_sum = count_words && buffers_written;
uint unrolled_in_prims = p->input_primitives;
/* Determine the number of primitives generated in each stream */
uint4 in_prims = 0;
libagx_foreach_xfb(streams, i) {
in_prims[i] = libagx_previous_xfb_primitives(
p, static_count[i], count_index[i], count_words, prefix_sum,
unrolled_in_prims);
*(p->prims_generated_counter[i]) += in_prims[i];
}
uint4 prims = in_prims;
uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
if (buffers_written) {
libagx_foreach_xfb(buffers_written, i) {
uint max_prims =
setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
unsigned stream = buffer_to_stream[i];
prims[stream] = min(prims[stream], max_prims);
}
int4 overflow = prims < in_prims;
libagx_foreach_xfb(streams, i) {
p->xfb_verts[i] = prims[i] * vertices_per_prim;
*(p->xfb_overflow[i]) += (bool)overflow[i];
*(p->xfb_prims_generated_counter[i]) += prims[i];
}
*(p->xfb_any_overflow) += any(overflow);
/* Update XFB counters */
libagx_foreach_xfb(buffers_written, i) {
uint32_t prim_stride_B = stride[i] * vertices_per_prim;
unsigned stream = buffer_to_stream[i];
global uint *ptr = p->xfb_offs_ptrs[i];
ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
*ptr += prims[stream] * prim_stride_B;
}
}
/* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec:
*
* In case of instanced geometry shaders (see section 11.3.4.2) the
* geometry shader invocations count is incremented for each separate
* instanced invocation.
*/
*gs_invocations += unrolled_in_prims * invocations;
*gs_primitives += emitted_prims;
/* Clipper queries are not well-defined, so we can emulate them in lots of
* silly ways. We need the hardware counters to implement them properly. For
* now, just consider all primitives emitted as passing through the clipper.
* This satisfies spec text:
*
* The number of primitives that reach the primitive clipping stage.
*
* and
*
* If at least one vertex of the primitive lies inside the clipping
* volume, the counter is incremented by one or more. Otherwise, the
* counter is incremented by zero or more.
*/
*c_primitives += emitted_prims;
*c_invocations += emitted_prims;
}


@ -1,410 +0,0 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#pragma once
#define MAX_SO_BUFFERS 4
#define MAX_VERTEX_STREAMS 4
enum agx_gs_shape {
/* Indexed, where indices are encoded as:
*
* round_to_pot(max_indices) * round_to_pot(input_primitives) *
* * instance_count
*
* invoked for max_indices * input_primitives * instance_count indices.
*
* This is used with any dynamic topology. No hardware instancing used.
*/
AGX_GS_SHAPE_DYNAMIC_INDEXED,
/* Indexed with a static index buffer. Indices ranges up to max_indices.
* Hardware instance count = input_primitives * software instance count.
*/
AGX_GS_SHAPE_STATIC_INDEXED,
/* Non-indexed. Dispatched as:
*
* (max_indices, input_primitives * instance count).
*/
AGX_GS_SHAPE_STATIC_PER_PRIM,
/* Non-indexed. Dispatched as:
*
* (max_indices * input_primitives, instance count).
*/
AGX_GS_SHAPE_STATIC_PER_INSTANCE,
};
static inline unsigned
agx_gs_rast_vertices(enum agx_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return max_indices * input_primitives * instance_count;
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM:
return max_indices;
case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
return max_indices * input_primitives;
}
UNREACHABLE("invalid shape");
}
static inline unsigned
agx_gs_rast_instances(enum agx_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return 1;
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM:
return input_primitives * instance_count;
case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
return instance_count;
}
UNREACHABLE("invalid shape");
}
static inline bool
agx_gs_indexed(enum agx_gs_shape shape)
{
return shape == AGX_GS_SHAPE_DYNAMIC_INDEXED ||
shape == AGX_GS_SHAPE_STATIC_INDEXED;
}
static inline unsigned
agx_gs_index_size(enum agx_gs_shape shape)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return 4;
case AGX_GS_SHAPE_STATIC_INDEXED:
return 1;
default:
return 0;
}
}
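
A hedged worked example tying the shape descriptions to the helpers above
(hypothetical numbers):

/* Example: shape = AGX_GS_SHAPE_STATIC_INDEXED, max_indices = 6,
 * input_primitives = 100, instance_count = 2:
 *
 *    agx_gs_rast_vertices()  -> 6         (one pass over the static indices)
 *    agx_gs_rast_instances() -> 100 * 2   (hardware instancing covers prims)
 *    agx_gs_indexed()        -> true
 *    agx_gs_index_size()     -> 1 byte per index
 */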
/* Heap to allocate from. */
struct agx_heap {
DEVICE(uchar) base;
uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct agx_heap) == 4 * 4);
#ifdef __OPENCL_VERSION__
static inline uint
_agx_heap_alloc_offs(global struct agx_heap *heap, uint size_B, bool atomic)
{
size_B = align(size_B, 16);
uint offs;
if (atomic) {
offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);
} else {
offs = heap->bottom;
heap->bottom = offs + size_B;
}
/* Use printf+abort because assert is stripped from release builds. */
if (heap->bottom >= heap->size) {
printf(
"FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
size_B, offs, heap->size);
abort();
}
return offs;
}
static inline uint
agx_heap_alloc_nonatomic_offs(global struct agx_heap *heap, uint size_B)
{
return _agx_heap_alloc_offs(heap, size_B, false);
}
static inline uint
agx_heap_alloc_atomic_offs(global struct agx_heap *heap, uint size_B)
{
return _agx_heap_alloc_offs(heap, size_B, true);
}
static inline global void *
agx_heap_alloc_nonatomic(global struct agx_heap *heap, uint size_B)
{
return heap->base + agx_heap_alloc_nonatomic_offs(heap, size_B);
}
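
A small worked example of the bump allocator above (made-up sizes, comment
only):

/* Starting from bottom = 0:
 *
 *    agx_heap_alloc_nonatomic_offs(heap, 20) -> returns 0,  bottom becomes 32
 *    agx_heap_alloc_nonatomic_offs(heap, 8)  -> returns 32, bottom becomes 48
 *
 * Every request is rounded up to a multiple of 16 first, which is why callers
 * (e.g. the shadow index buffer copy earlier in this diff) can rely on
 * 16-byte-aligned allocations.
 */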
uint64_t nir_load_ro_sink_address_poly(void);
static inline uint64_t
libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
uint elsize_B)
{
if (offset_el < size_el)
return index_buffer + (offset_el * elsize_B);
else
return nir_load_ro_sink_address_poly();
}
#endif
struct agx_ia_state {
/* Index buffer if present. */
uint64_t index_buffer;
/* Size of the bound index buffer for bounds checking */
uint32_t index_buffer_range_el;
/* Number of vertices per instance. Written by CPU for direct draw, indirect
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
} PACKED;
static_assert(sizeof(struct agx_ia_state) == 4 * 4);
static inline uint
libagx_index_buffer_range_el(uint size_el, uint offset_el)
{
return offset_el < size_el ? (size_el - offset_el) : 0;
}
struct agx_geometry_params {
/* Address of associated indirect draw buffer */
DEVICE(uint) indirect_desc;
/* Address of count buffer. For an indirect draw, this will be written by the
* indirect setup kernel.
*/
DEVICE(uint) count_buffer;
/* Address of the primitives generated counters */
DEVICE(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_overflow[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_any_overflow;
/* Pointers to transform feedback buffer offsets in bytes */
DEVICE(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];
/* Output index buffer, allocated by pre-GS. */
DEVICE(uint) output_index_buffer;
/* Address of transform feedback buffer in general, supplied by the CPU. */
DEVICE(uchar) xfb_base_original[MAX_SO_BUFFERS];
/* Address of transform feedback for the current primitive. Written by pre-GS
* program.
*/
DEVICE(uchar) xfb_base[MAX_SO_BUFFERS];
/* Address and present mask for the input to the geometry shader. These will
* reflect the vertex shader for VS->GS or instead the tessellation
* evaluation shader for TES->GS.
*/
uint64_t input_buffer;
uint64_t input_mask;
/* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
uint64_t flat_outputs;
uint32_t xfb_size[MAX_SO_BUFFERS];
/* Number of vertices emitted by transform feedback per stream. Written by
* the pre-GS program.
*/
uint32_t xfb_verts[MAX_VERTEX_STREAMS];
/* Within an indirect GS draw, the grids used to dispatch the VS/GS written
* out by the GS indirect setup kernel or the CPU for a direct draw. This is
* the "indirect local" format: first 3 is in threads, second 3 is in grid
* blocks. This lets us use nontrivial workgroups with indirect draws without
* needing any predication.
*/
uint32_t vs_grid[6];
uint32_t gs_grid[6];
/* Number of input primitives across all instances, calculated by the CPU for
* a direct draw or the GS indirect setup kernel for an indirect draw.
*/
uint32_t input_primitives;
/* Number of input primitives per instance, rounded up to a power-of-two and
* with the base-2 log taken. This is used to partition the output vertex IDs
* efficiently.
*/
uint32_t primitives_log2;
/* Number of bytes output by the GS count shader per input primitive (may be
* 0), written by CPU and consumed by indirect draw setup shader for
* allocating counts.
*/
uint32_t count_buffer_stride;
/* Dynamic input topology. Must be compatible with the geometry shader's
* layout() declared input class.
*/
uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct agx_geometry_params) == 86 * 4);
/* TCS shared memory layout:
*
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
*
* TODO: compact.
*/
static inline uint
libagx_tcs_in_offs_el(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
uint offs = util_bitcount64(crosslane_vs_out_mask &
(((uint64_t)(1) << location) - 1));
return base + offs;
}
static inline uint
libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
return libagx_tcs_in_offs_el(vtx, location, crosslane_vs_out_mask) * 16;
}
static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}
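
A hedged worked example for the input-layout helpers above (the mask and slot
numbers are hypothetical):

/* Example: crosslane_vs_out_mask has three bits set, say {0, 4, 7}.
 *
 *    libagx_tcs_in_offs_el(vtx = 2, location = 7, mask)
 *       = 2 * 3                            (base: full vertices before us)
 *       + popcount(mask & ((1 << 7) - 1))  (outputs below slot 7) = 2
 *       = 8 elements, i.e. byte offset 8 * 16 = 128
 *
 *    libagx_tcs_in_size(vertices_in_patch = 4, mask) = 4 * 3 * 16 = 192 bytes
 */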
/*
* TCS out buffer layout, per-patch:
*
* float tess_level_outer[4];
* float tess_level_inner[2];
* vec4 patch_out[MAX_PATCH_OUTPUTS];
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
*
* Vertex out are compacted based on the mask of written out. Patch
* out are used as-is.
*
* Bounding boxes are ignored.
*/
static inline uint
libagx_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
uint off = 0;
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
return off;
off += 4;
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
return off;
off += 2;
if (location >= VARYING_SLOT_PATCH0)
return off + (4 * (location - VARYING_SLOT_PATCH0));
/* Anything else is a per-vtx output */
off += 4 * nr_patch_out;
off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
return off + (4 * idx);
}
static inline uint
libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask) *
4;
}
static inline uint
libagx_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_offs_el(out_patch_size, 0, nr_patch_out, vtx_out_mask);
}
static inline uint
libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
4;
}
/* In a tess eval shader, stride for hw vertex ID */
#define LIBAGX_TES_PATCH_ID_STRIDE 8192
static uint
libagx_compact_prim(enum mesa_prim prim)
{
static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
#ifndef __OPENCL_VERSION__
assert(prim != MESA_PRIM_QUADS);
assert(prim != MESA_PRIM_QUAD_STRIP);
assert(prim != MESA_PRIM_POLYGON);
assert(prim != MESA_PRIM_PATCHES);
#endif
return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}
static enum mesa_prim
libagx_uncompact_prim(uint packed)
{
return (packed >= MESA_PRIM_QUADS) ? (packed + 3) : packed;
}
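
A short note on the packing above (partly an inference; the enum values are
not restated here):

/* Quads, quad strips and polygons are never fed to these kernels, so every
 * primitive type at or above MESA_PRIM_QUADS shifts down by three and shifts
 * back on uncompaction:
 *
 *    libagx_uncompact_prim(libagx_compact_prim(p)) == p   for supported p
 *
 * This leaves 11 packed values, which appears to be why callers such as
 * libagx_unroll_restart take the packed topology as `uint mode__11`.
 */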
/*
* Write a strip into a 32-bit index buffer. This is the sequence:
*
* (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
*
* For points, we write index buffers without restart just for remapping.
*/
static inline void
_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
uint32_t vertex_offset, uint32_t verts_in_prim,
uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
bool restart = n > 1;
if (verts_in_prim < n)
return;
GLOBAL uint32_t *out = &index_buffer[index_offset];
/* Write out indices for the strip */
for (uint32_t i = 0; i < verts_in_prim; ++i) {
out[i] = (vertex_offset + i) * stream_multiplier + stream;
}
if (restart)
out[verts_in_prim] = -1;
}
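
A hedged example of the strip writer above (made-up arguments, comment only):

/* _libagx_write_strip(out, index_offset = 0, vertex_offset = 10,
 *                     verts_in_prim = 3, stream = 0, stream_multiplier = 1,
 *                     n = 3)
 *
 * writes out[0..3] = {10, 11, 12, -1}: the three strip vertices followed by
 * the restart index (since n > 1). If verts_in_prim were less than n, nothing
 * would be written.
 */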


@ -21,6 +21,7 @@ libagx_spv = custom_target(
libagx_shader_files, '--',
'-I' + join_paths(meson.project_source_root(), 'include'),
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.project_source_root(), 'src/poly/cl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.current_source_dir(), '../../'),
'-I' + join_paths(meson.current_source_dir(), 'shaders'),


@ -3,148 +3,14 @@
* SPDX-License-Identifier: MIT
*/
#include "geometry.h"
#include "tessellator.h"
#include <agx_pack.h>
uint
libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p)
{
return p->input_patch_size;
}
uint
libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p)
{
return p->output_patch_size;
}
uint
libagx_tcs_unrolled_id(constant struct libagx_tess_args *p, uint3 wg_id)
{
return (wg_id.y * p->patches_per_instance) + wg_id.x;
}
uint64_t
libagx_tes_buffer(constant struct libagx_tess_args *p)
{
return p->tes_buffer;
}
/*
* Helper to lower indexing for a tess eval shader ran as a compute shader. This
* handles the tess+geom case. This is simpler than the general input assembly
* lowering, as we know:
*
* 1. the index buffer is U32
* 2. the index is in bounds
*
* Therefore we do a simple load. No bounds checking needed.
*/
uint32_t
libagx_load_tes_index(constant struct libagx_tess_args *p, uint32_t index)
{
/* Swap second and third vertices of each triangle to flip winding order
* dynamically if needed.
*/
if (p->ccw) {
uint id = index % 3;
if (id == 1)
index++;
else if (id == 2)
index--;
}
return p->index_buffer[index];
}
ushort
libagx_tcs_in_offset(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
return libagx_tcs_in_offs(vtx, location, crosslane_vs_out_mask);
}
uintptr_t
libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id,
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint out_patch_size, uint64_t vtx_out_mask)
{
uint stride_el =
libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask);
uint offs_el =
libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask);
offs_el += patch_id * stride_el;
/* Written to match the AGX addressing mode */
return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2);
}
static uint
libagx_tes_unrolled_patch_id(uint raw_id)
{
return raw_id / LIBAGX_TES_PATCH_ID_STRIDE;
}
uint
libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id)
{
return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
}
static uint
tes_vertex_id_in_patch(uint raw_id)
{
return raw_id % LIBAGX_TES_PATCH_ID_STRIDE;
}
float2
libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id)
{
uint patch = libagx_tes_unrolled_patch_id(raw_id);
uint vtx = tes_vertex_id_in_patch(raw_id);
global struct libagx_tess_point *t =
&p->patch_coord_buffer[p->coord_allocs[patch] + vtx];
/* Written weirdly because NIR struggles with loads of structs */
uint2 fixed = *((global uint2 *)t);
/* Convert fixed point to float */
return convert_float2(fixed) / (1u << 16);
}
uintptr_t
libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id,
uint vtx_id, gl_varying_slot location)
{
uint patch = libagx_tes_unrolled_patch_id(raw_id);
return libagx_tcs_out_address(p, patch, vtx_id, location,
p->tcs_patch_constants, p->output_patch_size,
p->tcs_per_vertex_outputs);
}
float4
libagx_tess_level_outer_default(constant struct libagx_tess_args *p)
{
return vload4(0, p->tess_level_outer_default);
}
float2
libagx_tess_level_inner_default(constant struct libagx_tess_args *p)
{
return vload2(0, p->tess_level_inner_default);
}
#include "poly/geometry.h"
#include "poly/tessellator.h"
KERNEL(1)
libagx_tess_setup_indirect(
global struct libagx_tess_args *p,
global struct poly_tess_args *p,
global uint32_t *grids /* output: VS then TCS then tess */,
global struct agx_ia_state *ia /* output */, global uint32_t *indirect,
global struct poly_ia_state *ia /* output */, global uint32_t *indirect,
global uint64_t *vertex_output_buffer_ptr, uint64_t in_index_buffer,
uint32_t in_index_buffer_range_el, uint32_t in_index_size_B,
uint64_t vertex_outputs /* bitfield */,
@ -174,11 +40,11 @@ libagx_tess_setup_indirect(
alloc += unrolled_patches * sizeof(uint32_t);
uint vb_offs = alloc;
uint vb_size = libagx_tcs_in_size(count * instance_count, vertex_outputs);
uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs);
alloc += vb_size;
/* Allocate all patch calculations in one go */
global uchar *blob = agx_heap_alloc_nonatomic(p->heap, alloc);
global uchar *blob = poly_heap_alloc_nonatomic(p->heap, alloc);
p->tcs_buffer = (global float *)(blob + tcs_out_offs);
p->patches_per_instance = in_patches;
@ -201,11 +67,11 @@ libagx_tess_setup_indirect(
*/
if (in_index_size_B) {
ia->index_buffer =
libagx_index_buffer(in_index_buffer, in_index_buffer_range_el,
indirect[2], in_index_size_B);
poly_index_buffer(in_index_buffer, in_index_buffer_range_el,
indirect[2], in_index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
}
/* VS grid size */

File diff suppressed because it is too large


@ -5,104 +5,14 @@
#pragma once
#include "compiler/libcl/libcl.h"
#include "poly/tessellator.h"
enum libagx_tess_partitioning {
LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD,
LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN,
LIBAGX_TESS_PARTITIONING_INTEGER,
};
enum libagx_tess_mode {
/* Do not actually tessellate, just write the index counts */
LIBAGX_TESS_MODE_COUNT,
/* Tessellate using the count buffers to allocate indices */
LIBAGX_TESS_MODE_WITH_COUNTS,
};
struct libagx_tess_point {
uint32_t u;
uint32_t v;
};
static_assert(sizeof(struct libagx_tess_point) == 8);
struct libagx_tess_args {
/* Heap to allocate tessellator outputs in */
DEVICE(struct agx_heap) heap;
/* Patch coordinate buffer, indexed as:
*
* coord_allocs[patch_ID] + vertex_in_patch
*/
DEVICE(struct libagx_tess_point) patch_coord_buffer;
/* Per-patch index within the heap for the tess coords, written by the
* tessellator based on the allocated memory.
*/
DEVICE(uint32_t) coord_allocs;
/* Space for output draws from the tessellator. API draw calls. */
DEVICE(uint32_t) out_draws;
/* Tessellation control shader output buffer. */
DEVICE(float) tcs_buffer;
/* Count buffer. # of indices per patch written here, then prefix summed. */
DEVICE(uint32_t) counts;
/* Allocated index buffer for all patches, if we're prefix summing counts */
DEVICE(uint32_t) index_buffer;
/* Address of the tess eval invocation counter for implementing pipeline
* statistics, if active. Zero if inactive. Incremented by tessellator.
*/
DEVICE(uint32_t) statistic;
/* When geom+tess used together, the buffer containing TES outputs (executed
* as a hardware compute shader).
*/
uint64_t tes_buffer;
/* Bitfield of TCS per-vertex outputs */
uint64_t tcs_per_vertex_outputs;
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
* Unused in Vulkan and OpenGL ES.
*/
float tess_level_outer_default[4];
float tess_level_inner_default[2];
/* Number of vertices in the input patch */
uint32_t input_patch_size;
/* Number of vertices in the TCS output patch */
uint32_t output_patch_size;
/* Number of patch constants written by TCS */
uint32_t tcs_patch_constants;
/* Number of input patches per instance of the VS/TCS */
uint32_t patches_per_instance;
/* Stride between tessellation factors in the TCS output buffer. */
uint32_t tcs_stride_el;
/* Number of patches being tessellated */
uint32_t nr_patches;
/* Partitioning and points mode. These affect per-patch setup code but not
* the hot tessellation loop so we make them dynamic to reduce tessellator
* variants.
*/
enum libagx_tess_partitioning partitioning;
uint32_t points_mode;
uint32_t isolines;
/* When fed into a geometry shader, triangles should be counter-clockwise.
* The tessellator always produces clockwise triangles, but we can swap
* dynamically in the TES.
*/
uint32_t ccw;
} PACKED;
static_assert(sizeof(struct libagx_tess_args) == 36 * 4);
#define libagx_tessellate(context, grid, barrier, prim, mode, state) \
if (prim == TESS_PRIMITIVE_QUADS) { \
libagx_tess_quad(context, grid, barrier, state, mode); \
} else if (prim == TESS_PRIMITIVE_TRIANGLES) { \
libagx_tess_tri(context, grid, barrier, state, mode); \
} else { \
assert(prim == TESS_PRIMITIVE_ISOLINES); \
libagx_tess_isoline(context, grid, barrier, state, mode); \
}


@ -5,10 +5,10 @@
* SPDX-License-Identifier: MIT
*/
#include "libagx/query.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "vulkan/vulkan_core.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_pack.h"
#include "agx_scratch.h"
#include "agx_tilebuffer.h"


@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include "poly/nir/poly_nir_lower_gs.h"
#include "agx_abi.h"
#include "agx_bg_eot.h"
#include "agx_bo.h"
@ -13,7 +14,6 @@
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_ppp.h"
#include "agx_tilebuffer.h"
@ -31,10 +31,10 @@
#include "asahi/genxml/agx_pack.h"
#include "asahi/libagx/compression.h"
#include "asahi/libagx/geometry.h"
#include "asahi/libagx/libagx.h"
#include "asahi/libagx/query.h"
#include "asahi/libagx/tessellator.h"
#include "poly/geometry.h"
#include "util/blend.h"
#include "util/format/format_utils.h"
#include "util/format/u_formats.h"
@ -1007,9 +1007,9 @@ hk_heap(struct hk_cmd_buffer *cmd)
* the CPU as rodata, even though the GPU uses it for scratch internally.
*/
off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
struct poly_heap *map = agx_bo_map(dev->rodata.bo) + off;
*map = (struct agx_heap){
*map = (struct poly_heap){
.base = dev->heap->va->addr,
.size = size,
};
@ -1021,7 +1021,7 @@ hk_heap(struct hk_cmd_buffer *cmd)
uint64_t addr = dev->rodata.heap;
/* Zeroing the allocated index frees everything */
hk_queue_write(cmd, addr + offsetof(struct agx_heap, bottom), 0,
hk_queue_write(cmd, addr + offsetof(struct poly_heap, bottom), 0,
true /* after gfx */);
cmd->uses_heap = true;
@ -1045,7 +1045,7 @@ hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");
struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
struct poly_ia_state ia = {.verts_per_instance = draw.b.count[0]};
if (draw.indexed) {
unsigned index_size_B = agx_index_size_to_B(draw.index_size);
@ -1115,7 +1115,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
mode = u_decomposed_prim(mode);
}
struct agx_geometry_params params = {
struct poly_geometry_params params = {
.flat_outputs = fs->info.fs.interp.flat,
.input_topology = mode,
@ -1174,7 +1174,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.vs_grid[4] = params.gs_grid[4] = 1;
params.vs_grid[5] = params.gs_grid[5] = 1;
struct agx_gs_info *gsi = &count->info.gs;
struct poly_gs_info *gsi = &count->info.gs;
if (indirect) {
/* TODO: size */
@ -1183,7 +1183,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.indirect_desc = cmd->geom_indirect;
params.vs_grid[2] = params.gs_grid[2] = 1;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
/* Need to allocate heap if we haven't yet */
hk_heap(cmd);
@ -1191,7 +1191,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
cmd->geom_index_count = dev->heap->size;
} else {
cmd->geom_index_count =
agx_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0);
poly_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0);
}
} else {
uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
@ -1207,13 +1207,13 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
}
cmd->geom_index_count = agx_gs_rast_vertices(
cmd->geom_index_count = poly_gs_rast_vertices(
gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
cmd->geom_instance_count = agx_gs_rast_instances(
cmd->geom_instance_count = poly_gs_rast_instances(
gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
params.output_index_buffer =
hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu;
@ -1221,7 +1221,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
}
}
if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) {
cmd->geom_index_buffer =
hk_pool_upload(cmd, count->info.gs.topology, gsi->max_indices * 4, 4);
}
@ -1231,7 +1231,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
}
static void
hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct poly_tess_args *out,
struct agx_draw draw)
{
struct hk_device *dev = hk_cmd_buffer_device(cmd);
@ -1239,14 +1239,14 @@ hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
struct hk_graphics_state *gfx = &cmd->state.gfx;
struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
enum libagx_tess_partitioning partitioning =
enum poly_tess_partitioning partitioning =
gfx->tess.info.spacing == TESS_SPACING_EQUAL
? LIBAGX_TESS_PARTITIONING_INTEGER
? POLY_TESS_PARTITIONING_INTEGER
: gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
: LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
? POLY_TESS_PARTITIONING_FRACTIONAL_ODD
: POLY_TESS_PARTITIONING_FRACTIONAL_EVEN;
struct libagx_tess_args args = {
struct poly_tess_args args = {
.heap = hk_heap(cmd),
.tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
.statistic = hk_pipeline_stat_addr(
@ -1428,7 +1428,7 @@ hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct agx_draw draw,
libagx_unroll_restart_struct(cmd, agx_1d(1024 * draw_count),
AGX_BARRIER_ALL | AGX_PREGFX, ia,
libagx_compact_prim(prim));
poly_compact_prim(prim));
return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
dev->heap->size, draw.index_size,
@ -1485,7 +1485,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
gsi.vertex_buffer = desc->root.draw.tess_params +
offsetof(struct libagx_tess_args, tes_buffer);
offsetof(struct poly_tess_args, tes_buffer);
} else {
gsi.vertex_buffer = desc->root.root_desc_addr +
offsetof(struct hk_root_descriptor_table,
@ -1501,10 +1501,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
AGX_BARRIER_ALL | AGX_PREGFX, gsi);
grid_vs = agx_grid_indirect_local(
geometry_params + offsetof(struct agx_geometry_params, vs_grid));
geometry_params + offsetof(struct poly_geometry_params, vs_grid));
grid_gs = agx_grid_indirect_local(
geometry_params + offsetof(struct agx_geometry_params, gs_grid));
geometry_params + offsetof(struct poly_geometry_params, gs_grid));
} else {
grid_vs = grid_gs = draw.b;
grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
@ -1554,9 +1554,9 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
/* Pre-rast geometry shader */
hk_dispatch_with_local_size(cmd, cs, main, grid_gs, wg);
if (agx_gs_indexed(count->info.gs.shape)) {
if (poly_gs_indexed(count->info.gs.shape)) {
enum agx_index_size index_size =
agx_translate_index_size(agx_gs_index_size(count->info.gs.shape));
agx_translate_index_size(poly_gs_index_size(count->info.gs.shape));
if (agx_is_indirect(draw.b)) {
return agx_draw_indexed_indirect(
@ -1661,13 +1661,13 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
/* First generate counts, then prefix sum them, and then tessellate. */
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
LIBAGX_TESS_MODE_COUNT, state);
POLY_TESS_MODE_COUNT, state);
libagx_prefix_sum_tess(cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX,
state, c_prims, c_inv, c_prims || c_inv);
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
LIBAGX_TESS_MODE_WITH_COUNTS, state);
POLY_TESS_MODE_WITH_COUNTS, state);
return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
dev->heap->size, AGX_INDEX_SIZE_U32, false);
@ -2219,8 +2219,9 @@ hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
uint32_t index = cmd->state.gfx.index.restart;
if (gs) {
enum agx_gs_shape shape = gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape;
index = BITFIELD_MASK(8 * agx_gs_index_size(shape));
enum poly_gs_shape shape =
gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape;
index = BITFIELD_MASK(8 * poly_gs_index_size(shape));
}
/* VDM State updates are relatively expensive, so only emit them when the
@ -3061,7 +3062,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
bool indirect = agx_is_indirect(draw.b) || draw.restart;
desc->root.draw.input_assembly =
indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
indirect ? hk_pool_alloc(cmd, sizeof(struct poly_ia_state), 4).gpu
: hk_upload_ia_params(cmd, draw);
desc->root_dirty = true;
}
@ -3078,7 +3079,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
if (!indirect) {
uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
unsigned vb_size =
libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
poly_tcs_in_size(verts * instances, vs->b.info.outputs);
/* Allocate if there are any outputs, or use the null sink to trap
* reads if there aren't. Those reads are undefined but should not
@ -3094,7 +3095,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
struct agx_ptr tess_args = {0};
if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4);
tess_args = hk_pool_alloc(cmd, sizeof(struct poly_tess_args), 4);
gfx->descriptors.root.draw.tess_params = tess_args.gpu;
gfx->descriptors.root_dirty = true;
}

View file

@ -19,8 +19,8 @@
#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/libagx/geometry.h"
#include "compiler/nir/nir_builder.h"
#include "poly/geometry.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
@ -86,7 +86,7 @@ hk_upload_rodata(struct hk_device *dev)
*/
offs = align(offs, sizeof(uint64_t));
dev->rodata.heap = dev->rodata.bo->va->addr + offs;
offs += sizeof(struct agx_heap);
offs += sizeof(struct poly_heap);
return VK_SUCCESS;
}

View file

@ -8,10 +8,10 @@
*/
#include "hk_shader.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "agx_debug.h"
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "glsl_types.h"
#include "hk_instance.h"
@ -1114,13 +1114,13 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
shader->info.tess.tcs_output_patch_size =
nir->info.tess.tcs_vertices_out;
shader->info.tess.tcs_per_vertex_outputs =
agx_tcs_per_vertex_outputs(nir);
poly_tcs_per_vertex_outputs(nir);
shader->info.tess.tcs_nr_patch_outputs =
util_last_bit(nir->info.patch_outputs_written);
shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir);
shader->info.tess.tcs_output_stride = poly_tcs_output_stride(nir);
} else {
/* This destroys info so it needs to happen after the gather */
NIR_PASS(_, nir, agx_nir_lower_tes, hw);
NIR_PASS(_, nir, poly_nir_lower_tes, hw);
}
}
@ -1137,7 +1137,7 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
if (hw) {
hk_lower_hw_vs(nir, shader, kill_psiz);
} else {
NIR_PASS(_, nir, agx_nir_lower_vs_before_gs);
NIR_PASS(_, nir, poly_nir_lower_vs_before_gs);
nir->info.stage = MESA_SHADER_COMPUTE;
memset(&nir->info.cs, 0, sizeof(nir->info.cs));
nir->xfb_info = NULL;
@ -1335,7 +1335,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
hk_populate_vs_key(&key_tmp.vs, state);
key = &key_tmp;
} else if (sw_stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS(_, nir, agx_nir_lower_tcs);
NIR_PASS(_, nir, poly_nir_lower_tcs);
}
/* Compile all variants up front */
@ -1345,7 +1345,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs,
NIR_PASS(_, nir, poly_nir_lower_gs, &count, &rast, &pre_gs,
&count_variant->info.gs);
agx_preprocess_nir(count);

View file

@ -8,9 +8,9 @@
#pragma once
#include "asahi/compiler/agx_compile.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/macros.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "agx_usc.h"
@ -94,7 +94,7 @@ struct hk_shader_info {
struct hk_tess_info info;
} tess;
struct agx_gs_info gs;
struct poly_gs_info gs;
/* Used to initialize the union for other stages */
uint8_t _pad[32];

View file

@ -5,10 +5,10 @@
#include "compiler/nir/nir_builder.h"
#include "pipe/p_defines.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "agx_abi.h"
#include "agx_nir_lower_gs.h"
#include "agx_state.h"
#include "nir.h"
#include "nir_builder_opcodes.h"

View file

@ -34,6 +34,8 @@
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/blend.h"
@ -57,10 +59,8 @@
#include "agx_disk_cache.h"
#include "agx_linker.h"
#include "agx_nir.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_tilebuffer.h"
#include "geometry.h"
#include "libagx.h"
#include "libagx_dgc.h"
#include "libagx_shaders.h"
@ -1544,7 +1544,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
/* Auxiliary programs */
struct agx_gs_info gs_info = {0};
struct poly_gs_info gs_info = {0};
uint64_t outputs = 0;
struct agx_fs_epilog_link_info epilog_key = {false};
nir_shader *gs_count = NULL;
@ -1564,7 +1564,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
struct asahi_vs_shader_key *key = &key_->vs;
if (nir->info.vs.tes_poly) {
NIR_PASS(_, nir, agx_nir_lower_tes, key->hw);
NIR_PASS(_, nir, poly_nir_lower_tes, key->hw);
} else {
NIR_PASS(_, nir, agx_nir_gather_vs_inputs, attrib_components_read);
NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog);
@ -1580,7 +1580,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
} else {
NIR_PASS(_, nir, agx_nir_lower_vs_before_gs);
NIR_PASS(_, nir, poly_nir_lower_vs_before_gs);
/* Turn into a compute shader now that we're free of vertexisms */
nir->info.stage = MESA_SHADER_COMPUTE;
@ -1589,9 +1589,9 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
outputs = nir->info.outputs_written;
}
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS(_, nir, agx_nir_lower_tcs);
NIR_PASS(_, nir, poly_nir_lower_tcs);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
NIR_PASS(_, nir, poly_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
&gs_info);
agx_preprocess_nir(gs_count);
@ -1932,11 +1932,11 @@ agx_create_shader_state(struct pipe_context *pctx,
so->tess.spacing = nir->info.tess.spacing;
so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
so->tess.primitive = nir->info.tess._primitive_mode;
so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
so->tess.per_vertex_outputs = poly_tcs_per_vertex_outputs(nir);
so->tess.nr_patch_outputs =
util_last_bit(nir->info.patch_outputs_written);
if (nir->info.stage == MESA_SHADER_TESS_CTRL)
so->tess.output_stride = agx_tcs_output_stride(nir);
so->tess.output_stride = poly_tcs_output_stride(nir);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
so->gs_mode = nir->info.gs.output_primitive;
}
@ -3903,7 +3903,7 @@ agx_batch_heap(struct agx_batch *batch)
PIPE_USAGE_DEFAULT, size);
}
struct agx_heap heap = {
struct poly_heap heap = {
.base = agx_resource(ctx->heap)->bo->va->addr,
.size = size,
};
@ -3924,7 +3924,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
{
struct agx_ia_state ia = {
struct poly_ia_state ia = {
.index_buffer = input_index_buffer,
.index_buffer_range_el = index_buffer_size_B / info->index_size,
.verts_per_instance = draw ? draw->count : 0,
@ -3933,7 +3933,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
batch->uniforms.input_assembly =
agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
struct agx_geometry_params params = {
struct poly_geometry_params params = {
.indirect_desc = batch->geom_indirect,
.flat_outputs =
batch->ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
@ -4017,8 +4017,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.input_primitives = params.gs_grid[0] * info->instance_count;
unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned vb_size = poly_tcs_in_size(draw->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned size = params.input_primitives * params.count_buffer_stride;
if (size && prefix_sum) {
@ -4034,8 +4034,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.input_buffer = addr;
}
struct agx_gs_info *gsi = &batch->ctx->gs->gs;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
struct poly_gs_info *gsi = &batch->ctx->gs->gs;
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
unsigned idx_size = params.input_primitives * gsi->max_indices;
params.output_index_buffer =
@ -4125,10 +4125,10 @@ agx_launch_gs_prerast(struct agx_batch *batch,
libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
grid_vs = agx_grid_indirect_local(
gp + offsetof(struct agx_geometry_params, vs_grid));
gp + offsetof(struct poly_geometry_params, vs_grid));
grid_gs = agx_grid_indirect_local(
gp + offsetof(struct agx_geometry_params, gs_grid));
gp + offsetof(struct poly_geometry_params, gs_grid));
} else {
grid_vs = agx_3d(draws->count, info->instance_count, 1);
@ -4246,7 +4246,7 @@ agx_draw_without_restart(struct agx_batch *batch,
/* Unroll the index buffer for each draw */
libagx_unroll_restart_struct(batch, agx_1d(1024 * indirect->draw_count),
AGX_BARRIER_ALL, unroll,
libagx_compact_prim(info->mode));
poly_compact_prim(info->mode));
/* Now draw the results without restart */
struct pipe_draw_info new_info = {
@ -4538,8 +4538,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
enum libagx_tess_partitioning partitioning =
(enum libagx_tess_partitioning)pspacing;
enum poly_tess_partitioning partitioning =
(enum poly_tess_partitioning)pspacing;
struct agx_bo *draw_bo = NULL;
size_t draw_stride = 5 * sizeof(uint32_t);
@ -4557,7 +4557,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
if (info->index_size)
ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
struct agx_ia_state ia = {
struct poly_ia_state ia = {
.index_buffer = ib,
.index_buffer_range_el = ib_extent,
.verts_per_instance = draws ? draws->count : 0,
@ -4572,7 +4572,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
uint64_t heap = agx_batch_heap(batch);
assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
struct libagx_tess_args args = {
struct poly_tess_args args = {
.heap = heap,
.tcs_stride_el = tcs->tess.output_stride / 4,
.statistic = agx_get_query_address(
@ -4644,8 +4644,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
args.out_draws = blob.gpu + draw_offs;
args.counts = blob.gpu + count_offs;
unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned vb_size = poly_tcs_in_size(draws->count * info->instance_count,
batch->uniforms.vertex_outputs);
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_upload(&batch->pool, &addr, 8);
@ -4716,11 +4716,11 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
/* Generate counts, then prefix sum them, then finally tessellate. */
libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
LIBAGX_TESS_MODE_COUNT, state);
POLY_TESS_MODE_COUNT, state);
libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state, c_prims,
c_invs, c_prims || c_invs);
libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
LIBAGX_TESS_MODE_WITH_COUNTS, state);
POLY_TESS_MODE_WITH_COUNTS, state);
/* Face culling state needs to be specialized for tess */
ctx->dirty |= AGX_DIRTY_RS;
@ -5141,12 +5141,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
agx_launch_gs_prerast(batch, info, draws, indirect);
/* Setup to rasterize the GS results */
struct agx_gs_info *gsi = &ctx->gs->gs;
struct poly_gs_info *gsi = &ctx->gs->gs;
info_gs = (struct pipe_draw_info){
.mode = gsi->mode,
.index_size = agx_gs_index_size(gsi->shape),
.primitive_restart = agx_gs_indexed(gsi->shape),
.restart_index = agx_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0,
.index_size = poly_gs_index_size(gsi->shape),
.primitive_restart = poly_gs_indexed(gsi->shape),
.restart_index = poly_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0,
.index.resource = &index_rsrc.base,
.instance_count = 1,
};
@ -5167,11 +5167,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
u_decomposed_prims_for_vertices(info->mode, draws->count);
draw_gs = (struct pipe_draw_start_count_bias){
.count = agx_gs_rast_vertices(gsi->shape, gsi->max_indices, prims,
info->instance_count),
.count = poly_gs_rast_vertices(gsi->shape, gsi->max_indices, prims,
info->instance_count),
};
info_gs.instance_count = agx_gs_rast_instances(
info_gs.instance_count = poly_gs_rast_instances(
gsi->shape, gsi->max_indices, prims, info->instance_count);
draws = &draw_gs;
@ -5184,10 +5184,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
batch->reduced_prim = u_reduced_prim(info->mode);
ctx->dirty |= AGX_DIRTY_PRIM;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
ib = batch->geom_index;
ib_extent = index_rsrc.bo->size - (batch->geom_index - ib);
} else if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
} else if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) {
ib = agx_pool_upload(&batch->pool, gsi->topology, gsi->max_indices);
ib_extent = gsi->max_indices;
}

View file

@ -18,13 +18,14 @@
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/libagx/geometry.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
@ -32,7 +33,6 @@
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_texture.h"
#ifdef __GLIBC__
@ -248,7 +248,7 @@ struct agx_compiled_shader {
struct agx_compiled_shader *gs_count, *pre_gs;
struct agx_compiled_shader *gs_copy;
struct agx_gs_info gs;
struct poly_gs_info gs;
/* Logical shader stage used for descriptor access. This may differ from the
* physical shader stage of the compiled shader, for example when executing a

View file

@ -53,6 +53,9 @@ if with_gallium_or_lvp or with_gbm or with_platform_wayland
subdir('loader')
endif
subdir('compiler')
if with_poly
subdir('poly')
endif
if with_tools.contains('drm-shim')
subdir('drm-shim')
endif

8
src/poly/.clang-format Normal file
View file

@ -0,0 +1,8 @@
BasedOnStyle: InheritParentConfig
DisableFormat: false
AlignConsecutiveBitFields: Consecutive
ColumnLimit: 80
BreakStringLiterals: false
SpaceBeforeParens: ControlStatementsExceptControlMacros

501
src/poly/cl/geometry.cl Normal file
View file

@ -0,0 +1,501 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* Copyright 2025 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
#include "util/macros.h"
#include "util/u_math.h"
uint64_t nir_ro_to_rw_poly(uint64_t address);
/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
* ID list with a consistent winding order.
*
* Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
* is its own inverse. It is hence used for both vertex fetch and transform
* feedback.
*/
static uint
map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
{
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
return (provoking || even) ? vert : ((3 - pv) - vert);
}
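/* As a worked example (values chosen purely for illustration): with
* flatshade_first = false the provoking vertex is index 2, so odd primitives
* swap vertices 0 and 1. Strip primitive 1 then has local order (1, 0, 2),
* i.e. strip vertices (2, 1, 3), giving the decomposition
* (0, 1, 2), (2, 1, 3), (2, 3, 4). With flatshade_first = true, vertices 1
* and 2 swap instead, giving (0, 1, 2), (1, 3, 2), (2, 3, 4).
*/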
static inline uint
xfb_prim(uint id, uint n, uint copy)
{
return sub_sat(id, n - 1u) + copy;
}
/*
* Determine whether an output vertex has an n'th copy in the transform feedback
* buffer. This is written weirdly to let constant folding remove unnecessary
* stores when length is known statically.
*/
bool
poly_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
{
uint prim = xfb_prim(id, n, copy);
int num_prims = length - (n - 1);
return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
}
uint
poly_xfb_vertex_offset(uint n, uint invocation_base_prim, uint strip_base_prim,
uint id_in_strip, uint copy, bool flatshade_first)
{
uint prim = xfb_prim(id_in_strip, n, copy);
uint vert_0 = min(id_in_strip, n - 1);
uint vert = vert_0 - copy;
if (n == 3) {
vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
/* Tally up in the whole buffer */
uint base_prim = invocation_base_prim + strip_base_prim;
uint base_vertex = base_prim * n;
return base_vertex + (prim * n) + vert;
}
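/* Worked example, assuming a triangle strip (n = 3) with both base
* primitives zero: strip vertex 3 completes primitive 1, so copy = 0 returns
* (1 * 3) + 2 = 5, the last slot of the second expanded triangle. The same
* vertex reappears in primitive 2, so copy = 1 returns (2 * 3) + 1 = 7
* (with flatshade_first = false, vertex 1 of an even primitive is not
* remapped).
*/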
uint64_t
poly_xfb_vertex_address(constant struct poly_geometry_params *p, uint index,
uint buffer, uint stride, uint output_offset)
{
uint xfb_offset = (index * stride) + output_offset;
return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
}
static uint
vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
/* (0, 1), (1, 2), (2, 0) */
if (prim == (num_prims - 1) && vert == 1)
return 0;
else
return prim + vert;
}
uint
poly_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
uint num_prims)
{
/* Line list, line strip, or line loop */
if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
return 0;
if (mode == MESA_PRIM_LINES)
prim *= 2;
return prim + vert;
}
static uint
vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
* first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
* Piglit clipflat expects us to switch between these orders depending on
* provoking vertex, to avoid trivializing the fan.
*
* Rotate accordingly.
*/
if (flatshade_first) {
vert = (vert == 2) ? 0 : (vert + 1);
}
/* The simpler form assuming last is provoking. */
return (vert == 0) ? 0 : prim + vert;
}
uint
poly_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
bool flatshade_first)
{
if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
vert = vert + 1;
vert = (vert == 3) ? 0 : vert;
}
if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
return 0;
if (mode == MESA_PRIM_TRIANGLES)
prim *= 3;
/* Triangle list, triangle strip, or triangle fan */
if (mode == MESA_PRIM_TRIANGLE_STRIP) {
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
vert = ((provoking || even) ? vert : ((3 - pv) - vert));
}
return prim + vert;
}
uint
poly_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
{
/* Line list adj or line strip adj */
if (mode == MESA_PRIM_LINES_ADJACENCY)
prim *= 4;
return prim + vert;
}
static uint
vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
bool flatshade_first)
{
/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
*
* There are different cases for first/middle/last/only primitives and for
* odd/even primitives. Determine which case we're in.
*/
bool last = prim == (num_prims - 1);
bool first = prim == 0;
bool even = (prim & 1) == 0;
bool even_or_first = even || first;
/* When the last vertex is provoking, we rotate the primitives
* accordingly. This seems required for OpenGL.
*/
if (!flatshade_first && !even_or_first) {
vert = (vert + 4u) % 6u;
}
/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
* there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
*
* Here we assume the first vertex is provoking, the Vulkan default.
*/
uint offsets[6] = {
0,
first ? 1 : (even ? -2 : 3),
even_or_first ? 2 : 4,
last ? 5 : 6,
even_or_first ? 4 : 2,
even_or_first ? 3 : -2,
};
/* Ensure NIR can see thru the local array */
uint offset = 0;
for (uint i = 1; i < 6; ++i) {
if (i == vert)
offset = offsets[i];
}
/* Finally add to the base of the primitive */
return (prim * 2) + offset;
}
uint
poly_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
uint nr, bool flatshade_first)
{
/* Tri adj list or tri adj strip */
if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
} else {
return (6 * prim) + vert;
}
}
static uint
vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
uint vert, uint num_prims)
{
switch (mode) {
case MESA_PRIM_POINTS:
case MESA_PRIM_LINES:
case MESA_PRIM_TRIANGLES:
case MESA_PRIM_LINES_ADJACENCY:
case MESA_PRIM_TRIANGLES_ADJACENCY:
/* Regular primitive: every N vertices defines a primitive */
return (prim * mesa_vertices_per_prim(mode)) + vert;
case MESA_PRIM_LINE_LOOP:
return vertex_id_for_line_loop(prim, vert, num_prims);
case MESA_PRIM_LINE_STRIP:
case MESA_PRIM_LINE_STRIP_ADJACENCY:
/* (i, i + 1) or (i, ..., i + 3) */
return prim + vert;
case MESA_PRIM_TRIANGLE_STRIP: {
/* Order depends on the provoking vert.
*
* First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
* Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
*
* Pull the (maybe swapped) vert from the corresponding primitive
*/
return prim + map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
case MESA_PRIM_TRIANGLE_FAN:
return vertex_id_for_tri_fan(prim, vert, flatshade_first);
case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
return vertex_id_for_tri_strip_adj(prim, vert, num_prims,
flatshade_first);
default:
return 0;
}
}
uint
poly_map_to_line_adj(uint id)
{
/* Sequence (1, 2), (5, 6), (9, 10), ... */
return ((id & ~1) * 2) + (id & 1) + 1;
}
uint
poly_map_to_line_strip_adj(uint id)
{
/* Sequence (1, 2), (2, 3), (4, 5), .. */
uint prim = id / 2;
uint vert = id & 1;
return prim + vert + 1;
}
uint
poly_map_to_tri_strip_adj(uint id)
{
/* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
*
* Although tri strips with adjacency have 6 cases in general, after
* disregarding the vertices only available in a geometry shader, there are
* only even/odd cases. In other words, it's just a triangle strip subject to
* extra padding.
*
* Dividing through by two, the sequence is:
*
* (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
*/
uint prim = id / 3;
uint vtx = id % 3;
/* Flip the winding order of odd triangles */
if ((prim % 2) == 1) {
if (vtx == 1)
vtx = 2;
else if (vtx == 2)
vtx = 1;
}
return 2 * (prim + vtx);
}
uint
poly_load_index_buffer(constant struct poly_ia_state *p, uint id,
uint index_size)
{
return poly_load_index(p->index_buffer, p->index_buffer_range_el, id,
index_size);
}
static uint
setup_xfb_buffer(global struct poly_geometry_params *p, uint i, uint stride,
uint max_output_end, uint vertices_per_prim)
{
uint xfb_offset = *(p->xfb_offs_ptrs[i]);
p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
/* Let output_end = output_offset + output_size.
*
* Primitive P will write up to (but not including) offset:
*
* xfb_offset + ((P - 1) * (verts_per_prim * stride))
* + ((verts_per_prim - 1) * stride)
* + output_end
*
* To fit all outputs for P, that value must be less than the XFB
* buffer size for the output with maximal output_end, as everything
* else is constant here across outputs within a buffer/primitive:
*
* floor(P) <= (stride + size - xfb_offset - output_end)
* // (stride * verts_per_prim)
*/
int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
uint numer = max(numer_s, 0);
return numer / (stride * vertices_per_prim);
}
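/* Numeric sketch with made-up values: xfb_size = 1010 B, stride = 16 B,
* vertices_per_prim = 3, max_output_end = 16, xfb_offset = 40. Then
* numer = 1010 + (16 - 16) - 40 = 970 and the result is 970 / 48 = 20
* primitives: the 20th ends at 40 + 19*48 + 2*16 + 16 = 1000 bytes and fits,
* while a 21st would need 1048 bytes and does not.
*/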
void
poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
uint32_t prim_index_offset, uint32_t vertex_offset,
uint32_t verts_in_prim, uint3 info)
{
_poly_write_strip(index_buffer, inv_index_offset + prim_index_offset,
vertex_offset, verts_in_prim, info.x, info.y, info.z);
}
void
poly_pad_index_gs(global int *index_buffer, uint inv_index_offset,
uint nr_indices, uint alloc)
{
for (uint i = nr_indices; i < alloc; ++i) {
index_buffer[inv_index_offset + i] = -1;
}
}
uintptr_t
poly_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
gl_varying_slot location)
{
/* Written like this to let address arithmetic work */
return buffer + ((uintptr_t)poly_tcs_in_offs_el(vtx, location, mask)) * 16;
}
uintptr_t
poly_geometry_input_address(constant struct poly_geometry_params *p, uint vtx,
gl_varying_slot location)
{
return poly_vertex_output_address(p->input_buffer, p->input_mask, vtx,
location);
}
unsigned
poly_input_vertices(constant struct poly_ia_state *ia)
{
return ia->verts_per_instance;
}
global uint *
poly_load_xfb_count_address(constant struct poly_geometry_params *p, int index,
int count_words, uint unrolled_id)
{
return &p->count_buffer[(unrolled_id * count_words) + index];
}
uint
poly_previous_xfb_primitives(global struct poly_geometry_params *p,
int static_count, int count_index, int count_words,
bool prefix_sum, uint unrolled_id)
{
if (static_count >= 0) {
/* If the number of outputted vertices per invocation is known statically,
* we can calculate the base.
*/
return unrolled_id * static_count;
} else {
/* Otherwise, load from the count buffer. Note that the sums are
* inclusive, so index 0 is nonzero. This requires a little fixup here. We
* use a saturating unsigned subtraction so we don't read out-of-bounds.
*
* If we didn't prefix sum, there's only one element.
*/
uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
return unrolled_id == 0 ? 0 : count;
}
}
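/* For instance, with static_count = -1, prefix_sum = true, count_words = 2
* and count_index = 0, invocation 5 reads count_buffer[4 * 2 + 0], the
* inclusive sum through primitive 4, i.e. the number of primitives emitted
* before it; invocation 0 simply returns 0.
*/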
/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
#define poly_foreach_xfb(word, index) \
for (uint i = 0; i < 4; ++i) \
if (word & BITFIELD_BIT(i))
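/* Note that the loop variable is hard-coded to i, so the index argument must
* be spelled i at every call site (as it is below); the parameter exists only
* for readability.
*/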
void
poly_pre_gs(global struct poly_geometry_params *p, uint streams,
uint buffers_written, uint4 buffer_to_stream, int4 count_index,
uint4 stride, uint4 output_end, int4 static_count, uint invocations,
uint vertices_per_prim, global uint *gs_invocations,
global uint *gs_primitives, global uint *c_primitives,
global uint *c_invocations)
{
unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
!!(count_index[2] >= 0) + !!(count_index[3] >= 0);
bool prefix_sum = count_words && buffers_written;
uint unrolled_in_prims = p->input_primitives;
/* Determine the number of primitives generated in each stream */
uint4 in_prims = 0;
poly_foreach_xfb(streams, i) {
in_prims[i] = poly_previous_xfb_primitives(p, static_count[i],
count_index[i], count_words,
prefix_sum, unrolled_in_prims);
*(p->prims_generated_counter[i]) += in_prims[i];
}
uint4 prims = in_prims;
uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
if (buffers_written) {
poly_foreach_xfb(buffers_written, i) {
uint max_prims =
setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
unsigned stream = buffer_to_stream[i];
prims[stream] = min(prims[stream], max_prims);
}
int4 overflow = prims < in_prims;
poly_foreach_xfb(streams, i) {
p->xfb_verts[i] = prims[i] * vertices_per_prim;
*(p->xfb_overflow[i]) += (bool)overflow[i];
*(p->xfb_prims_generated_counter[i]) += prims[i];
}
*(p->xfb_any_overflow) += any(overflow);
/* Update XFB counters */
poly_foreach_xfb(buffers_written, i) {
uint32_t prim_stride_B = stride[i] * vertices_per_prim;
unsigned stream = buffer_to_stream[i];
global uint *ptr = p->xfb_offs_ptrs[i];
ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
*ptr += prims[stream] * prim_stride_B;
}
}
/* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec:
*
* In case of instanced geometry shaders (see section 11.3.4.2) the
* geometry shader invocations count is incremented for each separate
* instanced invocation.
*/
*gs_invocations += unrolled_in_prims * invocations;
*gs_primitives += emitted_prims;
/* Clipper queries are not well-defined, so we can emulate them in lots of
* silly ways. We need the hardware counters to implement them properly. For
* now, just consider all primitives emitted as passing through the clipper.
* This satisfies spec text:
*
* The number of primitives that reach the primitive clipping stage.
*
* and
*
* If at least one vertex of the primitive lies inside the clipping
* volume, the counter is incremented by one or more. Otherwise, the
* counter is incremented by zero or more.
*/
*c_primitives += emitted_prims;
*c_invocations += emitted_prims;
}

35
src/poly/cl/meson.build Normal file
View file

@ -0,0 +1,35 @@
# Copyright 2024 Valve Corporation
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
libpoly_shader_files = files(
'geometry.cl',
'tessellation.cl',
)
libpoly_shaders_spv = custom_target(
input : libpoly_shader_files,
output : 'libpoly.spv',
command : [
prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
libpoly_shader_files, '--',
'-I' + join_paths(meson.project_source_root(), 'include'),
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.current_source_dir(), '../../'),
cl_args,
],
depends : [],
depfile : 'libpoly_shaders.h.d',
)
libpoly_shaders = custom_target(
input : libpoly_shaders_spv,
output : ['libpoly.cpp', 'libpoly.h'],
command : [prog_vtn_bindgen2, libpoly_shaders_spv, '@OUTPUT0@', '@OUTPUT1@'],
)
idep_libpoly = declare_dependency(
sources : [libpoly_shaders],
include_directories : include_directories('.'),
)

133
src/poly/cl/tessellation.cl Normal file
View file

@ -0,0 +1,133 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "poly/geometry.h"
#include "poly/tessellator.h"
uint
poly_tcs_patch_vertices_in(constant struct poly_tess_args *p)
{
return p->input_patch_size;
}
uint
poly_tes_patch_vertices_in(constant struct poly_tess_args *p)
{
return p->output_patch_size;
}
uint
poly_tcs_unrolled_id(constant struct poly_tess_args *p, uint3 wg_id)
{
return (wg_id.y * p->patches_per_instance) + wg_id.x;
}
uint64_t
poly_tes_buffer(constant struct poly_tess_args *p)
{
return p->tes_buffer;
}
/*
* Helper to lower indexing for a tess eval shader run as a compute shader. This
* handles the tess+geom case. This is simpler than the general input assembly
* lowering, as we know:
*
* 1. the index buffer is U32
* 2. the index is in bounds
*
* Therefore we do a simple load. No bounds checking needed.
*/
uint32_t
poly_load_tes_index(constant struct poly_tess_args *p, uint32_t index)
{
/* Swap second and third vertices of each triangle to flip winding order
* dynamically if needed.
*/
if (p->ccw) {
uint id = index % 3;
if (id == 1)
index++;
else if (id == 2)
index--;
}
return p->index_buffer[index];
}
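/* Example: with p->ccw set, fetches for triangles written as
* (0, 1, 2), (3, 4, 5) become (0, 2, 1), (3, 5, 4), flipping each triangle's
* winding without touching the index buffer itself.
*/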
uintptr_t
poly_tcs_out_address(constant struct poly_tess_args *p, uint patch_id,
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint out_patch_size, uint64_t vtx_out_mask)
{
uint stride_el =
poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask);
uint offs_el =
poly_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask);
offs_el += patch_id * stride_el;
/* Written to match the AGX addressing mode */
return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2);
}
static uint
tes_unrolled_patch_id(uint raw_id)
{
return raw_id / POLY_TES_PATCH_ID_STRIDE;
}
uint
poly_tes_patch_id(constant struct poly_tess_args *p, uint raw_id)
{
return tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
}
static uint
tes_vertex_id_in_patch(uint raw_id)
{
return raw_id % POLY_TES_PATCH_ID_STRIDE;
}
float2
poly_load_tess_coord(constant struct poly_tess_args *p, uint raw_id)
{
uint patch = tes_unrolled_patch_id(raw_id);
uint vtx = tes_vertex_id_in_patch(raw_id);
global struct poly_tess_point *t =
&p->patch_coord_buffer[p->coord_allocs[patch] + vtx];
/* Written weirdly because NIR struggles with loads of structs */
uint2 fixed = *((global uint2 *)t);
/* Convert fixed point to float */
return convert_float2(fixed) / (1u << 16);
}
uintptr_t
poly_tes_in_address(constant struct poly_tess_args *p, uint raw_id, uint vtx_id,
gl_varying_slot location)
{
uint patch = tes_unrolled_patch_id(raw_id);
return poly_tcs_out_address(p, patch, vtx_id, location,
p->tcs_patch_constants, p->output_patch_size,
p->tcs_per_vertex_outputs);
}
float4
poly_tess_level_outer_default(constant struct poly_tess_args *p)
{
return vload4(0, p->tess_level_outer_default);
}
float2
poly_tess_level_inner_default(constant struct poly_tess_args *p)
{
return vload2(0, p->tess_level_inner_default);
}

1609
src/poly/cl/tessellator.h Normal file

File diff suppressed because it is too large

641
src/poly/geometry.h Normal file
View file

@ -0,0 +1,641 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#ifdef __OPENCL_VERSION__
#include "compiler/libcl/libcl_vk.h"
#endif
#pragma once
#define POLY_MAX_SO_BUFFERS 4
#define POLY_MAX_VERTEX_STREAMS 4
enum poly_gs_shape {
/* Indexed, where indices are encoded as:
*
* round_to_pot(max_indices) * round_to_pot(input_primitives) *
*    instance_count
*
* invoked for max_indices * input_primitives * instance_count indices.
*
* This is used with any dynamic topology. No hardware instancing used.
*/
POLY_GS_SHAPE_DYNAMIC_INDEXED,
/* Indexed with a static index buffer. Indices range up to max_indices.
* Hardware instance count = input_primitives * software instance count.
*/
POLY_GS_SHAPE_STATIC_INDEXED,
/* Non-indexed. Dispatched as:
*
* (max_indices, input_primitives * instance count).
*/
POLY_GS_SHAPE_STATIC_PER_PRIM,
/* Non-indexed. Dispatched as:
*
* (max_indices * input_primitives, instance count).
*/
POLY_GS_SHAPE_STATIC_PER_INSTANCE,
};
static inline unsigned
poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return max_indices * input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return max_indices;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return max_indices * input_primitives;
}
UNREACHABLE("invalid shape");
}
static inline unsigned
poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 1;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return instance_count;
}
UNREACHABLE("invalid shape");
}
static inline bool
poly_gs_indexed(enum poly_gs_shape shape)
{
return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
shape == POLY_GS_SHAPE_STATIC_INDEXED;
}
static inline unsigned
poly_gs_index_size(enum poly_gs_shape shape)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 4;
case POLY_GS_SHAPE_STATIC_INDEXED:
return 1;
default:
return 0;
}
}
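/* Dispatch sketch with assumed values max_indices = 6, input_primitives = 100
* and instance_count = 2:
*
*    DYNAMIC_INDEXED:     1200 vertices, 1 instance, 32-bit indices
*    STATIC_INDEXED:      6 vertices, 200 instances, 8-bit indices
*    STATIC_PER_PRIM:     6 vertices, 200 instances, non-indexed
*    STATIC_PER_INSTANCE: 600 vertices, 2 instances, non-indexed
*/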
/* Heap to allocate from. */
struct poly_heap {
DEVICE(uchar) base;
uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct poly_heap) == 4 * 4);
#ifdef __OPENCL_VERSION__
static inline uint
_poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B, bool atomic)
{
size_B = align(size_B, 16);
uint offs;
if (atomic) {
offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);
} else {
offs = heap->bottom;
heap->bottom = offs + size_B;
}
/* Use printf+abort because assert is stripped from release builds. */
if (heap->bottom >= heap->size) {
printf(
"FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
size_B, offs, heap->size);
abort();
}
return offs;
}
static inline uint
poly_heap_alloc_nonatomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, false);
}
static inline uint
poly_heap_alloc_atomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, true);
}
static inline global void *
poly_heap_alloc_nonatomic(global struct poly_heap *heap, uint size_B)
{
return heap->base + poly_heap_alloc_nonatomic_offs(heap, size_B);
}
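/* Usage sketch (the heap itself is set up by the driver, e.g. via
* agx_batch_heap, and count is a caller-supplied element count):
*
*    global uint *scratch =
*       (global uint *)poly_heap_alloc_nonatomic(heap, count * sizeof(uint));
*
* Sizes are rounded up to 16 bytes, so successive allocations stay 16-byte
* aligned; the atomic variant is for threads allocating concurrently.
*/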
uint64_t nir_load_ro_sink_address_poly(void);
static inline uint64_t
poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
uint elsize_B)
{
if (offset_el < size_el)
return index_buffer + (offset_el * elsize_B);
else
return nir_load_ro_sink_address_poly();
}
#endif
struct poly_ia_state {
/* Index buffer if present. */
uint64_t index_buffer;
/* Size of the bound index buffer for bounds checking */
uint32_t index_buffer_range_el;
/* Number of vertices per instance. Written by CPU for direct draw, indirect
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
} PACKED;
static_assert(sizeof(struct poly_ia_state) == 4 * 4);
static inline uint
poly_index_buffer_range_el(uint size_el, uint offset_el)
{
return offset_el < size_el ? (size_el - offset_el) : 0;
}
struct poly_geometry_params {
/* Address of associated indirect draw buffer */
DEVICE(uint) indirect_desc;
/* Address of count buffer. For an indirect draw, this will be written by the
* indirect setup kernel.
*/
DEVICE(uint) count_buffer;
/* Address of the primitives generated counters */
DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_any_overflow;
/* Pointers to transform feedback buffer offsets in bytes */
DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS];
/* Output index buffer, allocated by pre-GS. */
DEVICE(uint) output_index_buffer;
/* Address of transform feedback buffer in general, supplied by the CPU. */
DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS];
/* Address of transform feedback for the current primitive. Written by pre-GS
* program.
*/
DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS];
/* Address and present mask for the input to the geometry shader. These will
* reflect the vertex shader for VS->GS or instead the tessellation
* evaluation shader for TES->GS.
*/
uint64_t input_buffer;
uint64_t input_mask;
/* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
uint64_t flat_outputs;
uint32_t xfb_size[POLY_MAX_SO_BUFFERS];
/* Number of vertices emitted by transform feedback per stream. Written by
* the pre-GS program.
*/
uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];
/* Within an indirect GS draw, the grids used to dispatch the VS/GS written
* out by the GS indirect setup kernel or the CPU for a direct draw. This is
* the "indirect local" format: first 3 is in threads, second 3 is in grid
* blocks. This lets us use nontrivial workgroups with indirect draws without
* needing any predication.
*/
uint32_t vs_grid[6];
uint32_t gs_grid[6];
/* Number of input primitives across all instances, calculated by the CPU for
* a direct draw or the GS indirect setup kernel for an indirect draw.
*/
uint32_t input_primitives;
/* Number of input primitives per instance, rounded up to a power-of-two and
* with the base-2 log taken. This is used to partition the output vertex IDs
* efficiently.
*/
uint32_t primitives_log2;
/* Number of bytes output by the GS count shader per input primitive (may be
* 0), written by CPU and consumed by indirect draw setup shader for
* allocating counts.
*/
uint32_t count_buffer_stride;
/* Dynamic input topology. Must be compatible with the geometry shader's
* layout() declared input class.
*/
uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct poly_geometry_params) == 86 * 4);
/* TCS shared memory layout:
*
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
*
* TODO: compact.
*/
static inline uint
poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
uint offs = util_bitcount64(crosslane_vs_out_mask &
(((uint64_t)(1) << location) - 1));
return base + offs;
}
static inline uint
poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}
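/* Example, assuming a mask with POS, PSIZ and VAR0 written (three bits set):
* each vertex occupies 3 vec4s, vertex v starts at element 3 * v, VAR0 lands
* at element 3 * v + 2 (two written slots below it), and 100 vertices need
* poly_tcs_in_size(100, mask) = 100 * 3 * 16 = 4800 bytes.
*/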
/*
* TCS out buffer layout, per-patch:
*
* float tess_level_outer[4];
* float tess_level_inner[2];
* vec4 patch_out[MAX_PATCH_OUTPUTS];
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
*
* Per-vertex outputs are compacted based on the mask of outputs written.
* Patch outputs are used as-is.
*
* Bounding boxes are ignored.
*/
static inline uint
poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
uint off = 0;
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
return off;
off += 4;
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
return off;
off += 2;
if (location >= VARYING_SLOT_PATCH0)
return off + (4 * (location - VARYING_SLOT_PATCH0));
/* Anything else is a per-vtx output */
off += 4 * nr_patch_out;
off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
return off + (4 * idx);
}
static inline uint
poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
vtx_out_mask);
}
static inline uint
poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
4;
}
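/* Example, assuming nr_patch_out = 2, out_patch_size = 3 and four bits set in
* vtx_out_mask: each patch holds 4 + 2 + (4 * 2) + (4 * 3 * 4) = 62 elements,
* so poly_tcs_out_stride() is 248 bytes, and the lowest written slot of
* vertex 1 sits at element 14 + (4 * 1 * 4) = 30, i.e. byte 120 into the
* patch.
*/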
/* In a tess eval shader, stride for hw vertex ID */
#define POLY_TES_PATCH_ID_STRIDE 8192
static inline uint
poly_compact_prim(enum mesa_prim prim)
{
static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
#ifndef __OPENCL_VERSION__
assert(prim != MESA_PRIM_QUADS);
assert(prim != MESA_PRIM_QUAD_STRIP);
assert(prim != MESA_PRIM_POLYGON);
assert(prim != MESA_PRIM_PATCHES);
#endif
return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}
static inline enum mesa_prim
poly_uncompact_prim(uint packed)
{
if (packed >= MESA_PRIM_QUADS)
return (enum mesa_prim)(packed + 3);
return (enum mesa_prim)packed;
}
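/* For example, MESA_PRIM_LINES_ADJACENCY compacts down onto the slot vacated
* by MESA_PRIM_QUADS (7) and MESA_PRIM_TRIANGLE_STRIP_ADJACENCY compacts to
* 10, so a compacted topology fits in 4 bits; poly_uncompact_prim() adds the
* 3 back for anything at or above the (never-compacted) quad range.
*/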
/*
* Write a strip into a 32-bit index buffer. This is the sequence:
*
* (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
*
* For points, we write index buffers without restart just for remapping.
*/
static inline void
_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
uint32_t vertex_offset, uint32_t verts_in_prim,
uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
bool restart = n > 1;
if (verts_in_prim < n)
return;
GLOBAL uint32_t *out = &index_buffer[index_offset];
/* Write out indices for the strip */
for (uint32_t i = 0; i < verts_in_prim; ++i) {
out[i] = (vertex_offset + i) * stream_multiplier + stream;
}
if (restart)
out[verts_in_prim] = -1;
}
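/* Example: index_offset = 10, vertex_offset = 100, verts_in_prim = 4,
* stream = 1, stream_multiplier = 4, n = 3 writes
* index_buffer[10..14] = {401, 405, 409, 413, ~0u}, the trailing ~0u being
* the restart index since n > 1.
*/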
static inline unsigned
poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
unsigned verts_per_patch)
{
if (prim >= MESA_PRIM_PATCHES) {
return vertices / verts_per_patch;
} else {
return u_decomposed_prims_for_vertices(prim, vertices);
}
}
#ifdef __OPENCL_VERSION__
/*
* Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
* manually with subgroup ops and local memory since Mesa doesn't do those
* lowerings yet.
*/
static inline uint2
poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
uint sg_id = get_sub_group_id();
/* Partial prefix sum of the subgroup */
uint sg = sub_group_scan_inclusive_add(x);
/* Reduction (sum) for the subgroup */
uint sg_sum = sub_group_broadcast(sg, 31);
/* Write out all the subgroups sums */
barrier(CLK_LOCAL_MEM_FENCE);
scratch[sg_id] = sg_sum;
barrier(CLK_LOCAL_MEM_FENCE);
/* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
* threads in subgroup T.
*/
uint other_sum = scratch[get_sub_group_local_id()];
/* Exclusive sum the subgroup sums to get the total before the current group,
* which can be added to the total for the current group.
*/
uint other_sums = sub_group_scan_exclusive_add(other_sum);
uint base = sub_group_broadcast(other_sums, sg_id);
uint prefix = base + sg;
/* Reduce the workgroup using the prefix sum we already did */
uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
return (uint2)(prefix, reduction);
}
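/* This is written for the launch shape the prefix-sum kernels appear to use:
* a subgroup size of 32 and a full 1024-thread workgroup (32 subgroups), with
* scratch holding one uint per subgroup. E.g. if every thread passes x = 1,
* thread t gets prefix = t + 1 and reduction = 1024.
*/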
static inline void
poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
uint word, uint wg_count)
{
uint tid = cl_local_id.x;
/* Main loop: complete workgroups processing multiple values at once */
uint i, count = 0;
uint len_remainder = len % wg_count;
uint len_rounded_down = len - len_remainder;
for (i = tid; i < len_rounded_down; i += wg_count) {
global uint *ptr = &buffer[(i * words) + word];
uint value = *ptr;
uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);
*ptr = count + sums[0];
count += sums[1];
}
/* The last iteration is special since we won't have a full workgroup unless
* the length is divisible by the workgroup size, and we don't advance count.
*/
global uint *ptr = &buffer[(i * words) + word];
uint value = (tid < len_remainder) ? *ptr : 0;
uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];
if (tid < len_remainder) {
*ptr = count + scan;
}
}
static inline void
poly_increment_counters(global uint32_t *a, global uint32_t *b,
global uint32_t *c, uint count)
{
global uint32_t *ptr[] = {a, b, c};
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
static inline void
poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *c_prims,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim, unsigned verts_per_patch)
{
poly_increment_counters(ia_vertices, vs_invocations, NULL,
draw[0] * draw[1]);
uint prims = poly_decomposed_prims_for_vertices_with_tess(prim, draw[0],
verts_per_patch) *
draw[1];
poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
}
static inline void
poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct poly_ia_state *ia /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index buffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices,
enum poly_gs_shape shape)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
uint instance_count = draw[1];
ia->verts_per_instance = vertex_count;
/* Calculate number of primitives input into the GS */
uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count);
p->input_primitives = prim_per_instance * instance_count;
/* Invoke VS as (vertices, instances); GS as (primitives, instances) */
p->vs_grid[0] = vertex_count;
p->vs_grid[1] = instance_count;
p->gs_grid[0] = prim_per_instance;
p->gs_grid[1] = instance_count;
p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*/
if (index_size_B) {
ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
draw[2], index_size_B);
ia->index_buffer_range_el =
poly_index_buffer_range_el(index_buffer_range_el, draw[2]);
}
/* We need to allocate VS and GS count buffers, do so now */
uint vertex_buffer_size =
poly_tcs_in_size(vertex_count * instance_count, vs_outputs);
if (is_prefix_summing) {
p->count_buffer = poly_heap_alloc_nonatomic(
heap, p->input_primitives * p->count_buffer_stride);
}
p->input_buffer =
(uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size);
*vertex_buffer = p->input_buffer;
p->input_mask = vs_outputs;
/* Allocate the index buffer and write the draw consuming it */
global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc;
*cmd = (VkDrawIndexedIndirectCommand){
.indexCount = poly_gs_rast_vertices(shape, max_indices, prim_per_instance,
instance_count),
.instanceCount = poly_gs_rast_instances(
shape, max_indices, prim_per_instance, instance_count),
};
if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
cmd->firstIndex =
poly_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4;
p->output_index_buffer =
(global uint *)(heap->base + (cmd->firstIndex * 4));
}
}
static uint
poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
/* If the load would be out-of-bounds, load the first element which is
* assumed valid. If the application index buffer is empty with robustness2,
* index_buffer will point to a zero sink where only the first is valid.
*/
if (oob) {
id = 0;
}
uint el;
if (index_size == 1) {
el = ((constant uint8_t *)index_buffer)[id];
} else if (index_size == 2) {
el = ((constant uint16_t *)index_buffer)[id];
} else {
el = ((constant uint32_t *)index_buffer)[id];
}
/* D3D robustness semantics. TODO: Optimize? */
if (oob) {
el = 0;
}
return el;
}
static void
poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
global uint32_t *out_32 = (global uint32_t *)index_buffer;
global uint16_t *out_16 = (global uint16_t *)index_buffer;
global uint8_t *out_8 = (global uint8_t *)index_buffer;
if (index_size_B == 4)
out_32[id] = value;
else if (index_size_B == 2)
out_16[id] = value;
else
out_8[id] = value;
}
#endif

9
src/poly/meson.build Normal file
View file

@ -0,0 +1,9 @@
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
inc_poly = include_directories([
'.', 'nir'
])
subdir('cl')
subdir('nir')

18
src/poly/nir/meson.build Normal file
View file

@ -0,0 +1,18 @@
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
libpoly_nir_files = files(
'poly_nir_lower_gs.c',
'poly_nir_lower_ia.c',
'poly_nir_lower_tess.c',
)
libpoly_nir = static_library(
'libpoly_nir',
[libpoly_nir_files],
include_directories : [inc_poly],
c_args : [no_override_init_args, '-Wno-c2x-extensions'],
gnu_symbol_visibility : 'hidden',
dependencies: [idep_nir, idep_mesautil, idep_libpoly],
build_by_default : false,
)

View file

@ -5,11 +5,11 @@
* SPDX-License-Identifier: MIT
*/
#include "agx_nir_lower_gs.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "compiler/nir/nir_builder.h"
#include "gallium/include/pipe/p_defines.h"
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/macros.h"
@ -85,7 +85,7 @@ rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
}
static bool
agx_nir_lower_gs_intrinsics(nir_shader *shader)
lower_gs_intrinsics(nir_shader *shader)
{
struct state state;
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
@ -158,16 +158,16 @@ agx_nir_lower_gs_intrinsics(nir_shader *shader)
}
struct lower_gs_state {
int static_count[MAX_VERTEX_STREAMS];
int static_count[POLY_MAX_VERTEX_STREAMS];
/* The index of each counter in the count buffer, or -1 if it's not in the
* count buffer.
*
* Invariant: info->count_words == sum(count_index[i] >= 0).
*/
int count_index[MAX_VERTEX_STREAMS];
int count_index[POLY_MAX_VERTEX_STREAMS];
struct agx_gs_info *info;
struct poly_gs_info *info;
};
/* Helpers for loading from the geometry state buffer */
@ -184,8 +184,8 @@ load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes)
#define load_geometry_param(b, field) \
load_geometry_param_offset( \
b, offsetof(struct agx_geometry_params, field), \
sizeof(((struct agx_geometry_params *)0)->field))
b, offsetof(struct poly_geometry_params, field), \
sizeof(((struct poly_geometry_params *)0)->field))
/* Helpers for lowering I/O to variables */
struct lower_output_to_var_state {
@ -257,18 +257,18 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls)
return prim;
case MESA_PRIM_LINES:
return libagx_vertex_id_for_line_class(b, topology, prim, vert, nr);
return poly_vertex_id_for_line_class(b, topology, prim, vert, nr);
case MESA_PRIM_TRIANGLES:
return libagx_vertex_id_for_tri_class(b, topology, prim, vert,
flatshade_first);
return poly_vertex_id_for_tri_class(b, topology, prim, vert,
flatshade_first);
case MESA_PRIM_LINES_ADJACENCY:
return libagx_vertex_id_for_line_adj_class(b, topology, prim, vert);
return poly_vertex_id_for_line_adj_class(b, topology, prim, vert);
case MESA_PRIM_TRIANGLES_ADJACENCY:
return libagx_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr,
flatshade_first);
return poly_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr,
flatshade_first);
default:
UNREACHABLE("invalid topology class");
@ -276,8 +276,8 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls)
}
nir_def *
agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *vertex)
poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *vertex)
{
assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input);
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
@ -287,15 +287,15 @@ agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
/* GS may be preceded by VS or TES so specified as param */
addr = libagx_geometry_input_address(
addr = poly_geometry_input_address(
b, nir_load_geometry_param_buffer_poly(b), vertex, location);
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL);
/* TCS always preceded by VS so we use the VS state directly */
addr = libagx_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
nir_load_vs_outputs_poly(b), vertex,
location);
addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
nir_load_vs_outputs_poly(b), vertex,
location);
}
addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr));
@ -320,7 +320,7 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
nir_def *unrolled =
nir_iadd(b, nir_imul(b, nir_load_instance_id(b), verts), vertex);
nir_def *val = agx_load_per_vertex_input(b, intr, unrolled);
nir_def *val = poly_load_per_vertex_input(b, intr, unrolled);
nir_def_replace(&intr->def, val);
return true;
}
@ -377,10 +377,10 @@ write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *id =
state->info->prefix_sum ? calc_unrolled_id(b) : nir_imm_int(b, 0);
nir_def *addr = libagx_load_xfb_count_address(
b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words), id);
nir_def *addr =
poly_load_xfb_count_address(b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words), id);
if (state->info->prefix_sum) {
nir_store_global(b, addr, 4, intr->src[2].ssa, nir_component_mask(1));
@ -656,7 +656,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs));
switch (state->info->shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED: {
case POLY_GS_SHAPE_DYNAMIC_INDEXED: {
unsigned stride = output_vertex_id_pot_stride(gs);
nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride);
@ -669,8 +669,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
break;
}
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM: {
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM: {
nir_def *stride = load_geometry_param(b, gs_grid[0]);
rs.output_id = raw_vertex_id;
@ -679,7 +679,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
break;
}
case AGX_GS_SHAPE_STATIC_PER_INSTANCE: {
case POLY_GS_SHAPE_STATIC_PER_INSTANCE: {
unsigned stride = MAX2(state->info->max_indices, 1);
rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
@ -733,8 +733,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
for (unsigned p_ = 0; p_ < n_; ++p_) {
nir_def *p = nir_imm_int(b, p_);
nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip,
strip_length, p));
nir_push_if(b, poly_xfb_vertex_copy_in_strip(b, n, id_in_strip,
strip_length, p));
/* Write XFB for each output */
for (unsigned i = 0; i < xfb->output_count; ++i) {
@ -746,14 +746,14 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
* base for this invocation for the stream plus the offset within
* this invocation.
*/
nir_def *invocation_base = libagx_previous_xfb_primitives(
nir_def *invocation_base = poly_previous_xfb_primitives(
b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->static_count[stream]),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words),
nir_imm_bool(b, state->info->prefix_sum), unrolled);
nir_def *index = libagx_xfb_vertex_offset(
nir_def *index = poly_xfb_vertex_offset(
b, n, invocation_base, base, id_in_strip, p,
nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
@ -776,7 +776,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
*/
value = nir_pad_vector_imm_int(b, value, 0, 4);
nir_def *addr = libagx_xfb_vertex_address(
nir_def *addr = poly_xfb_vertex_address(
b, nir_load_geometry_param_buffer_poly(b), index,
nir_imm_int(b, buffer), nir_imm_int(b, stride),
nir_imm_int(b, output.offset));
@ -842,12 +842,12 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
switch (intr->intrinsic) {
case nir_intrinsic_set_vertex_and_primitive_count: {
if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
break;
/* All streams are merged, just pick a single instruction */
if (nir_intrinsic_stream_id(intr) == 0) {
libagx_pad_index_gs(
poly_pad_index_gs(
b, load_geometry_param(b, output_index_buffer),
nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
@ -857,10 +857,10 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
}
case nir_intrinsic_emit_primitive_poly: {
if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
break;
libagx_write_strip(
poly_write_strip(
b, load_geometry_param(b, output_index_buffer),
nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
intr->src[0].ssa,
@ -903,14 +903,14 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
return true;
}
struct agx_xfb_key {
struct poly_xfb_key {
uint8_t streams;
uint8_t buffers_written;
uint8_t buffer_to_stream[NIR_MAX_XFB_BUFFERS];
int8_t count_index[4];
uint16_t stride[NIR_MAX_XFB_BUFFERS];
uint16_t output_end[NIR_MAX_XFB_BUFFERS];
int16_t static_count[MAX_VERTEX_STREAMS];
int16_t static_count[POLY_MAX_VERTEX_STREAMS];
uint16_t invocations;
uint16_t vertices_per_prim;
};
@ -921,14 +921,14 @@ struct agx_xfb_key {
* transform feedback offsets and counters as applicable.
*/
static nir_shader *
create_pre_gs(struct agx_xfb_key *key,
create_pre_gs(struct poly_xfb_key *key,
const nir_shader_compiler_options *options)
{
nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
"Pre-GS patch up");
nir_builder *b = &b_;
libagx_pre_gs(
poly_pre_gs(
b, nir_load_geometry_param_buffer_poly(b), nir_imm_int(b, key->streams),
nir_imm_int(b, key->buffers_written),
nir_imm_ivec4(b, key->buffer_to_stream[0], key->buffer_to_stream[1],
@ -1033,7 +1033,7 @@ calculate_max_indices(enum mesa_prim prim, unsigned verts)
}
struct topology_ctx {
struct agx_gs_info *info;
struct poly_gs_info *info;
uint32_t topology[384];
};
@ -1041,7 +1041,7 @@ static bool
evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct topology_ctx *ctx = data;
struct agx_gs_info *info = ctx->info;
struct poly_gs_info *info = ctx->info;
if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
return false;
@ -1050,7 +1050,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
* if-statements interleaved with other stuff).
*/
if (intr->instr.block != nir_start_block(b->impl)) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return false;
}
@ -1058,11 +1058,11 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
!nir_src_is_const(intr->src[2])) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return false;
}
_libagx_write_strip(
_poly_write_strip(
ctx->topology, nir_src_as_uint(intr->src[0]),
nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
@ -1076,7 +1076,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
* 0, 1, 2, -1, 3, 4, 5, ...
*/
static bool
match_list_topology(struct agx_gs_info *info, uint32_t count,
match_list_topology(struct poly_gs_info *info, uint32_t count,
uint32_t *topology, bool has_restart)
{
unsigned count_with_restart = count + has_restart;
@ -1095,7 +1095,7 @@ match_list_topology(struct agx_gs_info *info, uint32_t count,
}
/* If we match, rewrite the topology and drop indexing */
info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
info->shape = POLY_GS_SHAPE_STATIC_PER_INSTANCE;
info->mode = u_decomposed_prim(info->mode);
info->max_indices =
((info->max_indices + has_restart) / count_with_restart) * count;
@ -1131,12 +1131,12 @@ is_strip_topology(uint32_t *indices, uint32_t index_count)
* VS(compute) + GS(vertex) sequences without auxiliary programs.
*/
static void
optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
optimize_static_topology(struct poly_gs_info *info, nir_shader *gs)
{
struct topology_ctx ctx = {.info = info};
bool has_restart = info->mode != MESA_PRIM_POINTS;
nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (info->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED)
return;
/* We can always drop the trailing restart index */
@ -1150,7 +1150,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
/* Try to pattern match a strip topology */
if (is_strip_topology(ctx.topology, info->max_indices)) {
info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
info->shape = POLY_GS_SHAPE_STATIC_PER_PRIM;
return;
}
@ -1161,7 +1161,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
* XXX: check if this holds with streams.
*/
if (info->max_indices >= ARRAY_SIZE(info->topology)) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return;
}
@ -1170,12 +1170,12 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
info->topology[i] = ctx.topology[i];
}
info->shape = AGX_GS_SHAPE_STATIC_INDEXED;
info->shape = POLY_GS_SHAPE_STATIC_INDEXED;
}
bool
agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_shader **pre_gs, struct agx_gs_info *info)
poly_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_shader **pre_gs, struct poly_gs_info *info)
{
/* Lower I/O as assumed by the rest of GS lowering */
if (gs->xfb_info != NULL) {
@ -1212,7 +1212,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
/* Lower geometry shader writes to contain all of the required counts, so we
* know where in the various buffers we should write vertices.
*/
NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics);
NIR_PASS(_, gs, lower_gs_intrinsics);
/* Clean up after all that lowering we did */
bool progress = false;
@ -1241,7 +1241,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
*/
struct lower_gs_state gs_state = {.info = info};
*info = (struct agx_gs_info){
*info = (struct poly_gs_info){
.mode = gs->info.gs.output_primitive,
.xfb = gs->xfb_info != NULL,
.shape = -1,
@ -1252,10 +1252,13 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
gs_state.static_count, 4);
STATIC_ASSERT(ARRAY_SIZE(gs_state.count_index) ==
ARRAY_SIZE(gs_state.static_count));
/* Anything we don't know statically will be tracked by the count buffer.
* Determine the layout for it.
*/
for (unsigned i = 0; i < MAX_VERTEX_STREAMS; ++i) {
for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
gs_state.count_index[i] =
(gs_state.static_count[i] < 0) ? info->count_words++ : -1;
}
@ -1272,7 +1275,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
if (static_indices[0] >= 0) {
optimize_static_topology(info, gs);
} else {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
}
*gs_copy = create_gs_rast_shader(gs, &gs_state);
@ -1344,20 +1347,22 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
/* Gather information required for transform feedback / query programs */
struct nir_xfb_info *xfb = gs->xfb_info;
struct agx_xfb_key key = {
struct poly_xfb_key key = {
.streams = gs->info.gs.active_stream_mask,
.invocations = gs->info.gs.invocations,
.vertices_per_prim = nir_verts_in_output_prim(gs),
};
for (unsigned i = 0; i < 4; ++i) {
STATIC_ASSERT(ARRAY_SIZE(key.buffer_to_stream) == ARRAY_SIZE(key.stride));
for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
key.count_index[i] = gs_state.count_index[i];
key.static_count[i] = gs_state.static_count[i];
}
if (xfb) {
key.buffers_written = xfb->buffers_written;
for (unsigned i = 0; i < 4; ++i) {
for (unsigned i = 0; i < ARRAY_SIZE(key.buffer_to_stream); ++i) {
key.buffer_to_stream[i] = xfb->buffer_to_stream[i];
key.stride[i] = xfb->buffers[i].stride;
}
@ -1409,14 +1414,13 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
buffer = nir_load_vs_output_buffer_poly(b);
nr_verts =
libagx_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
nr_verts = poly_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
/* Instancing is unrolled during tessellation so nr_verts is ignored. */
nr_verts = nir_imm_int(b, 0);
buffer = libagx_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
}
if (b->shader->info.stage == MESA_SHADER_VERTEX &&
@ -1431,7 +1435,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *linear_id =
nir_iadd(b, nir_imul(b, instance_id, nr_verts), primitive_id);
nir_def *addr = libagx_vertex_output_address(
nir_def *addr = poly_vertex_output_address(
b, buffer, nir_imm_int64(b, b->shader->info.outputs_written), linear_id,
location);
@ -1444,7 +1448,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_vs_before_gs(struct nir_shader *vs)
poly_nir_lower_vs_before_gs(struct nir_shader *vs)
{
/* Lower vertex stores to memory stores */
return nir_shader_intrinsics_pass(vs, lower_vs_before_gs,

View file

@ -0,0 +1,61 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include "poly/geometry.h"
#include "nir.h"
#include "shader_enums.h"
struct nir_def *poly_load_per_vertex_input(struct nir_builder *b,
nir_intrinsic_instr *intr,
struct nir_def *vertex);
nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
bool poly_nir_lower_vs_before_gs(struct nir_shader *vs);
struct poly_gs_info {
/* Output primitive mode for geometry shaders */
enum mesa_prim mode;
/* Number of words per primitive in the count buffer */
unsigned count_words;
/* Per-input primitive stride of the output index buffer */
unsigned max_indices;
/* Whether the GS includes transform feedback at a compile-time level */
bool xfb;
/* Whether a prefix sum is required on the count outputs. Implies xfb */
bool prefix_sum;
/* Whether the GS writes to a stream other than stream #0 */
bool multistream;
/* Shape of the rasterization draw, named by the instance ID */
enum poly_gs_shape shape;
/* Static topology used if shape = POLY_GS_SHAPE_STATIC_INDEXED */
uint8_t topology[64];
};
bool poly_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
struct nir_shader **gs_copy, struct nir_shader **pre_gs,
struct poly_gs_info *info);
bool poly_nir_lower_tcs(struct nir_shader *tcs);
bool poly_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
uint64_t poly_tcs_per_vertex_outputs(const struct nir_shader *nir);
unsigned poly_tcs_output_stride(const struct nir_shader *nir);
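To show how these entry points fit together, a minimal driver-side sketch follows; only the poly_nir_lower_gs() signature and the struct poly_gs_info fields are taken from the declarations above, while the wrapper function, variable names, and buffer handling are assumptions.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side wrapper; only the poly_* names are from the
 * header above, everything else is illustrative. */
static void
example_lower_gs(nir_shader *gs_nir)
{
   nir_shader *gs_count = NULL, *gs_copy = NULL, *pre_gs = NULL;
   struct poly_gs_info info;

   poly_nir_lower_gs(gs_nir, &gs_count, &gs_copy, &pre_gs, &info);

   /* The driver compiles gs_nir plus the returned helper shaders and uses
    * info.count_words / info.max_indices to size the count and output
    * index buffers for the draw.
    */
   if (info.shape == POLY_GS_SHAPE_STATIC_INDEXED) {
      /* info.topology[] can be uploaded once as a static index buffer. */
   }
}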

View file

@ -4,25 +4,30 @@
*/
#include "compiler/nir/nir_builder.h"
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "agx_nir_lower_gs.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "nir.h"
/* XXX: Remove me later */
nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
/*
* This file implements basic input assembly in software. It runs on software
* vertex shaders, as part of geometry/tessellation lowering. It does not apply
* the topology, which happens in the geometry shader.
*/
nir_def *
agx_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
poly_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
{
/* If drawing with an index buffer, pull the vertex ID. Otherwise, the
* vertex ID is just the index as-is.
*/
if (index_size_B) {
nir_def *ia = nir_load_input_assembly_buffer_poly(b);
id = libagx_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
id = poly_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
}
/* Add the "start", either an index bias or a base vertex. This must happen
@ -39,7 +44,8 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
nir_def_replace(&intr->def, agx_nir_load_vertex_id(b, id, *index_size_B));
nir_def_replace(&intr->def,
poly_nir_load_vertex_id(b, id, *index_size_B));
return true;
} else if (intr->intrinsic == nir_intrinsic_load_instance_id) {
nir_def_replace(&intr->def,
@ -51,7 +57,7 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
poly_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
{
return nir_shader_intrinsics_pass(s, lower, nir_metadata_control_flow,
&index_size_B);
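A rough sketch of where this pass might be invoked from a driver; only poly_nir_lower_sw_vs() itself comes from this file, the surrounding call site is an assumption.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side call; index_size_B is 0 for non-indexed draws,
 * otherwise the index size in bytes (1, 2 or 4), matching the pass above. */
static void
example_lower_sw_vs(nir_shader *vs_nir, unsigned index_size_B)
{
   NIR_PASS(_, vs_nir, poly_nir_lower_sw_vs, index_size_B);
}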

View file

@ -3,11 +3,11 @@
* SPDX-License-Identifier: MIT
*/
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "agx_nir_lower_gs.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
@ -18,12 +18,12 @@
static nir_def *
tcs_unrolled_id(nir_builder *b)
{
return libagx_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_workgroup_id(b));
return poly_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_workgroup_id(b));
}
uint64_t
agx_tcs_per_vertex_outputs(const nir_shader *nir)
poly_tcs_per_vertex_outputs(const nir_shader *nir)
{
return nir->info.outputs_written &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER |
@ -31,11 +31,11 @@ agx_tcs_per_vertex_outputs(const nir_shader *nir)
}
unsigned
agx_tcs_output_stride(const nir_shader *nir)
poly_tcs_output_stride(const nir_shader *nir)
{
return libagx_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
nir->info.tess.tcs_vertices_out,
agx_tcs_per_vertex_outputs(nir));
return poly_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
nir->info.tess.tcs_vertices_out,
poly_tcs_per_vertex_outputs(nir));
}
static nir_def *
@ -44,12 +44,12 @@ tcs_out_addr(nir_builder *b, nir_intrinsic_instr *intr, nir_def *vertex_id)
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
nir_def *offset = nir_get_io_offset_src(intr)->ssa;
nir_def *addr = libagx_tcs_out_address(
nir_def *addr = poly_tcs_out_address(
b, nir_load_tess_param_buffer_poly(b), tcs_unrolled_id(b), vertex_id,
nir_iadd_imm(b, offset, sem.location),
nir_imm_int(b, util_last_bit(b->shader->info.patch_outputs_written)),
nir_imm_int(b, b->shader->info.tess.tcs_vertices_out),
nir_imm_int64(b, agx_tcs_per_vertex_outputs(b->shader)));
nir_imm_int64(b, poly_tcs_per_vertex_outputs(b->shader)));
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@ -68,9 +68,9 @@ lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
if (intr->intrinsic == nir_intrinsic_load_per_vertex_input)
vertex = intr->src[0].ssa;
nir_def *addr = libagx_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b), vertex,
nir_iadd_imm(b, offset, location));
nir_def *addr = poly_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b), vertex,
nir_iadd_imm(b, offset, location));
if (nir_intrinsic_has_component(intr))
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@ -84,10 +84,10 @@ tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_def *base = nir_imul(
b, tcs_unrolled_id(b),
libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
nir_def *vertex = nir_iadd(b, base, intr->src[0].ssa);
return agx_load_per_vertex_input(b, intr, vertex);
return poly_load_per_vertex_input(b, intr, vertex);
}
static nir_def *
@ -114,16 +114,15 @@ lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
return tcs_load_input(b, intr);
case nir_intrinsic_load_patch_vertices_in:
return libagx_tcs_patch_vertices_in(b,
nir_load_tess_param_buffer_poly(b));
return poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_tess_level_outer_default:
return libagx_tess_level_outer_default(
b, nir_load_tess_param_buffer_poly(b));
return poly_tess_level_outer_default(b,
nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_tess_level_inner_default:
return libagx_tess_level_inner_default(
b, nir_load_tess_param_buffer_poly(b));
return poly_tess_level_inner_default(b,
nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_output: {
nir_def *addr = tcs_out_addr(b, intr, nir_undef(b, 1, 32));
@ -176,7 +175,7 @@ lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_tcs(nir_shader *tcs)
poly_nir_lower_tcs(nir_shader *tcs)
{
return nir_shader_intrinsics_pass(tcs, lower_tcs, nir_metadata_control_flow,
NULL);
@ -187,12 +186,12 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_tess_coord_xy:
return libagx_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
return poly_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
case nir_intrinsic_load_primitive_id:
return libagx_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
return poly_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input:
@ -201,8 +200,7 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
return lower_tes_load(b, intr);
case nir_intrinsic_load_patch_vertices_in:
return libagx_tes_patch_vertices_in(b,
nir_load_tess_param_buffer_poly(b));
return poly_tes_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
default:
return NULL;
@ -232,12 +230,12 @@ lower_tes_indexing(nir_builder *b, nir_intrinsic_instr *intr, void *data)
b->cursor = nir_before_instr(&intr->instr);
nir_def *p = nir_load_tess_param_buffer_poly(b);
nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
nir_def_replace(&intr->def, libagx_load_tes_index(b, p, id));
nir_def_replace(&intr->def, poly_load_tes_index(b, p, id));
return true;
}
bool
agx_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
poly_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
{
nir_lower_tess_coord_z(
tes, tes->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
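The tessellation lowering entry points could plausibly be wired up as in the following sketch; only the poly_nir_lower_tcs/poly_nir_lower_tes and poly_tcs_output_stride declarations are real, while the wrapper and the reading of to_hw_vs are assumptions.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side wiring; to_hw_vs is presumably true when the
 * TES runs as a hardware vertex shader and false when it feeds a geometry
 * shader as a compute job. */
static void
example_lower_tess(nir_shader *tcs_nir, nir_shader *tes_nir, bool to_hw_vs)
{
   NIR_PASS(_, tcs_nir, poly_nir_lower_tcs);
   NIR_PASS(_, tes_nir, poly_nir_lower_tes, to_hw_vs);

   /* Stride reported by the common code, used by the driver when
    * allocating the TCS output buffer. */
   unsigned tcs_stride = poly_tcs_output_stride(tcs_nir);
   (void)tcs_stride;
}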

108
src/poly/tessellator.h Normal file
View file

@ -0,0 +1,108 @@
/*
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#pragma once
#include "compiler/libcl/libcl.h"
enum poly_tess_partitioning {
POLY_TESS_PARTITIONING_FRACTIONAL_ODD,
POLY_TESS_PARTITIONING_FRACTIONAL_EVEN,
POLY_TESS_PARTITIONING_INTEGER,
};
enum poly_tess_mode {
/* Do not actually tessellate, just write the index counts */
POLY_TESS_MODE_COUNT,
/* Tessellate using the count buffers to allocate indices */
POLY_TESS_MODE_WITH_COUNTS,
};
struct poly_tess_point {
uint32_t u;
uint32_t v;
};
static_assert(sizeof(struct poly_tess_point) == 8);
struct poly_tess_args {
/* Heap to allocate tessellator outputs in */
DEVICE(struct poly_heap) heap;
/* Patch coordinate buffer, indexed as:
*
* coord_allocs[patch_ID] + vertex_in_patch
*/
DEVICE(struct poly_tess_point) patch_coord_buffer;
/* Per-patch index within the heap for the tess coords, written by the
* tessellator based on the allocated memory.
*/
DEVICE(uint32_t) coord_allocs;
/* Space for output draws from the tessellator. API draw calls. */
DEVICE(uint32_t) out_draws;
/* Tessellation control shader output buffer. */
DEVICE(float) tcs_buffer;
/* Count buffer. # of indices per patch written here, then prefix summed. */
DEVICE(uint32_t) counts;
/* Allocated index buffer for all patches, if we're prefix summing counts */
DEVICE(uint32_t) index_buffer;
/* Address of the tess eval invocation counter for implementing pipeline
* statistics, if active. Zero if inactive. Incremented by tessellator.
*/
DEVICE(uint32_t) statistic;
/* When geom+tess used together, the buffer containing TES outputs (executed
* as a hardware compute shader).
*/
uint64_t tes_buffer;
/* Bitfield of TCS per-vertex outputs */
uint64_t tcs_per_vertex_outputs;
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
* Unused in Vulkan and OpenGL ES.
*/
float tess_level_outer_default[4];
float tess_level_inner_default[2];
/* Number of vertices in the input patch */
uint32_t input_patch_size;
/* Number of vertices in the TCS output patch */
uint32_t output_patch_size;
/* Number of patch constants written by TCS */
uint32_t tcs_patch_constants;
/* Number of input patches per instance of the VS/TCS */
uint32_t patches_per_instance;
/* Stride between tessellation factors in the TCS output buffer. */
uint32_t tcs_stride_el;
/* Number of patches being tessellated */
uint32_t nr_patches;
/* Partitioning and points mode. These affect per-patch setup code but not
* the hot tessellation loop so we make them dynamic to reduce tessellator
* variants.
*/
enum poly_tess_partitioning partitioning;
uint32_t points_mode;
uint32_t isolines;
/* When fed into a geometry shader, triangles should be counter-clockwise.
* The tessellator always produces clockwise triangles, but we can swap
* dynamically in the TES.
*/
uint32_t ccw;
} PACKED;
static_assert(sizeof(struct poly_tess_args) == 36 * 4);
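As a rough illustration, a driver might fill in the dynamic parts of this structure along these lines; the field names come from the struct above, while the chosen values (integer partitioning, GL default levels of 1.0) are assumptions for a simple no-TCS, triangle-domain case.

#include <stdint.h>
#include "poly/tessellator.h"

/* Hypothetical driver-side setup; only the struct layout is taken from the
 * header above. */
static void
example_fill_tess_args(struct poly_tess_args *args, uint32_t nr_patches)
{
   args->partitioning = POLY_TESS_PARTITIONING_INTEGER;
   args->points_mode = 0;
   args->isolines = 0;
   args->ccw = 0;
   args->nr_patches = nr_patches;

   /* OpenGL defaults when no TCS is in the pipeline (unused in Vulkan and
    * OpenGL ES, per the comment above). */
   for (unsigned i = 0; i < 4; ++i)
      args->tess_level_outer_default[i] = 1.0f;
   for (unsigned i = 0; i < 2; ++i)
      args->tess_level_inner_default[i] = 1.0f;
}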