kk: Implement tessellation

Same approach as HK for tessellation. It also handles instance_id lowering.
instance_id_includes_base_index is not taken into account in multiple
other passes that use instance id. These passes expect instance id to
actually be instance id. This change adds a pass to work around this.

Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41038>
This commit is contained in:
Aitor Camacho 2026-04-18 12:53:15 +09:00 committed by Marge Bot
parent 84929be129
commit 68048759f0
20 changed files with 797 additions and 150 deletions

View file

@ -61,6 +61,9 @@ void mtl_dispatch_threadgroups_with_indirect_buffer(
mtl_compute_encoder *encoder, mtl_buffer *buffer, uint32_t offset,
struct mtl_size local_size);
void mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
enum mtl_barrier_scope scope);
/* MTLRenderEncoder */
mtl_render_encoder *mtl_new_render_command_encoder_with_descriptor(
mtl_command_buffer *command_buffer, mtl_render_pass_descriptor *descriptor);

View file

@ -234,6 +234,16 @@ mtl_dispatch_threadgroups_with_indirect_buffer(mtl_compute_encoder *encoder,
}
}
void
mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
enum mtl_barrier_scope scope)
{
@autoreleasepool {
id<MTLComputeCommandEncoder> enc = (id<MTLComputeCommandEncoder>)encoder;
[enc memoryBarrierWithScope:(MTLBarrierScope)scope];
}
}
/* MTLRenderEncoder */
/* Encoder commands */

View file

@ -220,6 +220,12 @@ enum mtl_depth_clip_mode {
MTL_DEPTH_CLIP_MODE_CLAMP = 1,
};
enum mtl_barrier_scope {
MTL_BARRIER_SCOPE_BUFFERS = 1 << 0,
MTL_BARRIER_SCOPE_TEXTURES = 1 << 1,
MTL_BARRIER_SCOPE_RENDER_TARGETS = 1 << 2,
};
/** STRUCTURES */
struct mtl_range {
size_t offset;

View file

@ -59,6 +59,12 @@ mtl_copy_from_texture_to_texture(mtl_blit_encoder *blit_enc_handle,
{
}
void
mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
enum mtl_barrier_scope scope)
{
}
/* MTLComputeEncoder */
mtl_compute_encoder *
mtl_new_compute_command_encoder(mtl_command_buffer *cmd_buffer)

View file

@ -250,3 +250,16 @@ index_size_in_bytes_to_mtl_index_type(unsigned bytes)
UNREACHABLE("Unsupported byte size for index");
}
}
unsigned
mtl_index_type_to_size_B(enum mtl_index_type type)
{
switch (type) {
case MTL_INDEX_TYPE_UINT16:
return 2u;
case MTL_INDEX_TYPE_UINT32:
return 4u;
default:
UNREACHABLE("Unhandled index type");
}
}

View file

@ -76,4 +76,6 @@ enum mtl_cull_mode vk_front_face_to_mtl_cull_mode(enum VkCullModeFlagBits mode);
enum mtl_index_type index_size_in_bytes_to_mtl_index_type(unsigned bytes);
unsigned mtl_index_type_to_size_B(enum mtl_index_type type);
#endif /* KK_MTL_TO_VK_MAP_H */

View file

@ -574,3 +574,25 @@ msl_nir_lower_clip_cull_distance(nir_shader *nir, unsigned num_cull_distances)
else
NIR_PASS(_, nir, msl_nir_lower_clip_cull_distance_vs);
}
static bool
lower_instance_id(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
if (intr->intrinsic != nir_intrinsic_load_instance_id)
return false;
b->cursor = nir_after_instr(&intr->instr);
nir_def *base_instance = nir_load_base_instance(b);
nir_def *instance_id = nir_isub(b, &intr->def, base_instance);
nir_def_rewrite_uses_after(&intr->def, instance_id);
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
return true;
}
bool
msl_nir_lower_instance_id(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir, lower_instance_id,
nir_metadata_control_flow, NULL);
}

View file

@ -79,6 +79,7 @@ bool msl_nir_fake_guard_for_discards(struct nir_shader *nir);
bool msl_nir_lower_sample_shading(nir_shader *nir);
void msl_nir_lower_clip_cull_distance(nir_shader *nir,
unsigned num_cull_distances);
bool msl_nir_lower_instance_id(nir_shader *nir);
bool msl_gather_uses_per_draw_data(nir_shader *nir);
@ -94,7 +95,6 @@ static const nir_shader_compiler_options kk_nir_options = {
.lower_insert_byte = true,
.lower_fmod = true,
.discard_is_demote = true,
.instance_id_includes_base_index = true,
.lower_device_index_to_zero = true,
.lower_pack_64_2x32_split = true,
.lower_unpack_64_2x32_split = true,

View file

@ -0,0 +1,43 @@
/*
* Copyright 2026 LunarG, Inc.
* Copyright 2026 Google LLC
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
KERNEL(1)
libkk_prefix_sum_tess(global struct poly_tess_params *p)
{
if (cl_local_id.x != 0)
return;
/* The last element of an inclusive prefix sum is the total sum */
uint total = 0;
if (p->nr_patches > 0) {
for (uint32_t i = 0u; i < p->nr_patches; ++i) {
total += p->counts[i];
p->counts[i] = total;
}
}
/* Allocate 4-byte indices */
uint32_t elsize_B = sizeof(uint32_t);
uint32_t size_B = total * elsize_B;
uint alloc_B = poly_heap_alloc_offs(p->heap, size_B);
p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->base) + alloc_B);
/* ...and now we can generate the API indexed draw */
global uint32_t *desc = p->out_draws;
desc[0] = total; /* count */
desc[1] = 1; /* instance_count */
desc[2] = alloc_B / elsize_B; /* start */
desc[3] = 0; /* index_bias */
desc[4] = 0; /* start_instance */
}

View file

@ -0,0 +1,93 @@
/*
* Copyright 2026 LunarG, Inc.
* Copyright 2026 Google LLC
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "poly/geometry.h"
#include "poly/tessellator.h"
KERNEL(1)
libkk_tess_setup_indirect(
global struct poly_tess_params *p,
global uint32_t *grids /* output: VS then TCS then tess */,
global struct poly_vertex_params *vp /* output */, global uint32_t *indirect,
uint64_t in_index_buffer, uint32_t in_index_buffer_range_el,
uint32_t in_index_size_B, uint64_t vertex_outputs /* bitfield */,
/* Tess control invocation counter if active, else zero */
global uint32_t *tcs_statistic)
{
uint count = indirect[0], instance_count = indirect[1];
unsigned in_patches = count / p->input_patch_size;
/* TCS invocation counter increments once per-patch */
if (tcs_statistic) {
*tcs_statistic += in_patches;
}
size_t draw_stride = 5 * sizeof(uint32_t);
unsigned unrolled_patches = in_patches * instance_count;
uint32_t alloc = 0;
uint32_t tcs_out_offs = alloc;
alloc += unrolled_patches * p->tcs_stride_el * 4;
uint32_t patch_coord_offs = alloc;
alloc += unrolled_patches * 4;
uint32_t count_offs = alloc;
alloc += unrolled_patches * sizeof(uint32_t);
uint vb_offs = alloc;
uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs);
alloc += vb_size;
/* Allocate all patch calculations in one go */
global uchar *blob = poly_heap_alloc(p->heap, alloc);
p->tcs_buffer = (global float *)(blob + tcs_out_offs);
p->patches_per_instance = in_patches;
p->coord_allocs = (global uint *)(blob + patch_coord_offs);
p->nr_patches = unrolled_patches;
vp->output_buffer = (uintptr_t)(blob + vb_offs);
vp->outputs = vertex_outputs;
p->counts = (global uint32_t *)(blob + count_offs);
if (vp) {
vp->verts_per_instance = count;
}
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*
* XXX: Deduplicate?
*/
if (in_index_size_B) {
/* TODO_KOSMICKRISP Use poly_index_buffer and implement
* load_ro_sink_address_poly */
vp->index_buffer = in_index_buffer + (indirect[2] * in_index_size_B);
vp->index_buffer_range_el =
poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
}
/* VS grid size */
grids[0] = count;
grids[1] = instance_count;
grids[2] = 1;
/* TCS grid size */
grids[3] = in_patches * p->output_patch_size;
grids[4] = instance_count;
grids[5] = 1;
/* Tess grid size */
grids[6] = unrolled_patches;
grids[7] = 1;
grids[8] = 1;
}

View file

@ -0,0 +1,31 @@
/*
* Copyright 2026 LunarG, Inc.
* Copyright 2026 Google LLC
* SPDX-License-Identifier: MIT
*/
#include "poly/cl/tessellator.h"
KERNEL(1)
libkk_tess_isoline(constant struct poly_tess_params *p,
enum poly_tess_mode tess_mode)
{
uint patch = cl_global_id.x;
poly_tess_isoline_process(p, patch, tess_mode);
}
KERNEL(1)
libkk_tess_tri(constant struct poly_tess_params *p,
enum poly_tess_mode tess_mode)
{
uint patch = cl_global_id.x;
poly_tess_tri_process(p, patch, tess_mode);
}
KERNEL(1)
libkk_tess_quad(constant struct poly_tess_params *p,
enum poly_tess_mode tess_mode)
{
uint patch = cl_global_id.x;
poly_tess_quad_process(p, patch, tess_mode);
}

View file

@ -0,0 +1,20 @@
/*
* Copyright 2026 LunarG, Inc.
* Copyright 2026 Google LLC
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#pragma once
#include "poly/tessellator.h"
#define libkk_tessellate(context, grid, barrier, prim, mode, state) \
if (prim == TESS_PRIMITIVE_QUADS) { \
libkk_tess_quad(context, grid, barrier, state, mode); \
} else if (prim == TESS_PRIMITIVE_TRIANGLES) { \
libkk_tess_tri(context, grid, barrier, state, mode); \
} else { \
assert(prim == TESS_PRIMITIVE_ISOLINES); \
libkk_tess_isoline(context, grid, barrier, state, mode); \
}

View file

@ -4,7 +4,10 @@
libkk_shader_files = files(
'kk_draws.cl',
'kk_geometry.cl',
'kk_query.cl',
'kk_tessellation.cl',
'kk_tessellator.cl',
)
libkk_spv = custom_target(

View file

@ -48,8 +48,9 @@ struct kk_root_descriptor_table {
float blend_constant[4];
float clip_z_coeff;
uint32_t base_vertex;
uint32_t index_size;
uint64_t base_vertex_addr;
uint64_t base_instance_addr;
} draw;
struct {
uint32_t base_group[3];
@ -158,6 +159,16 @@ struct kk_graphics_state {
mtl_buffer *handles[KK_MAX_VBUFS];
} vb;
/* Tessellation state */
struct {
/* Grid buffer for when the draw is indirect */
struct kk_ptr indirect_ptr;
mtl_buffer *out_draws_buffer;
uint64_t out_draws_offset;
struct kk_tess_info info;
enum mesa_prim prim;
} tess;
/* Needed by vk_command_buffer::dynamic_graphics_state */
struct vk_vertex_input_state _dynamic_vi;
struct vk_sample_locations_state _dynamic_sl;

View file

@ -19,7 +19,10 @@
#include "kosmickrisp/bridge/mtl_bridge.h"
#include "kosmickrisp/bridge/vk_to_mtl_map.h"
#include "kosmickrisp/libkk/kk_tessellator.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
#include "vulkan/runtime/vk_render_pass.h"
#include "vulkan/util/vk_format.h"
@ -782,6 +785,255 @@ kk_flush_pipeline(struct kk_cmd_buffer *cmd)
if (gfx->depth_stencil_state)
mtl_set_depth_stencil_state(enc, gfx->depth_stencil_state);
}
/* Merge tess info before GS construction since that depends on
* gfx->tess.prim
*/
if ((IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) &&
cmd->state.shaders[MESA_SHADER_TESS_CTRL]) {
struct kk_shader *tesc = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
struct kk_shader *tese = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
gfx->tess.info =
kk_tess_info_merge(tese->info.tess.info, tesc->info.tess.info);
/* Determine primitive based on the merged state */
if (gfx->tess.info.points) {
gfx->tess.prim = MESA_PRIM_POINTS;
} else if (gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES) {
gfx->tess.prim = MESA_PRIM_LINES;
} else {
gfx->tess.prim = MESA_PRIM_TRIANGLES;
}
}
}
static void
kk_init_heap(const void *data)
{
struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data;
struct kk_device *dev = kk_cmd_buffer_device(cmd);
size_t size = 128 * 1024 * 1024;
kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap);
struct poly_heap *map = (struct poly_heap *)dev->heap->cpu;
/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
*map = (struct poly_heap){
.base = dev->heap->gpu + sizeof(struct poly_heap),
.size = size - sizeof(struct poly_heap),
};
}
static uint64_t
kk_heap(struct kk_cmd_buffer *cmd)
{
struct kk_device *dev = kk_cmd_buffer_device(cmd);
util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
/* We need to free all allocations after each command buffer execution */
if (!cmd->uses_heap) {
uint64_t addr = dev->heap->gpu;
/* Zeroing the allocated index frees everything */
kk_cmd_write(cmd, (struct libkk_imm_write){
addr + offsetof(struct poly_heap, bottom), 0});
cmd->uses_heap = true;
}
return dev->heap->gpu;
}
enum kk_predicate_op : uint16_t {
/* value > draw_id */
KK_PREDICATE_GT_DRAW_ID,
/* value == 0 */
KK_PREDICATE_EQ_ZERO,
/* value != 0 */
KK_PREDICATE_NEQ_ZERO,
};
struct kk_draw_command {
enum mesa_prim prim;
/* Mask of stages that need per-draw data uploaded */
uint32_t upload_mask;
mtl_buffer *index_buffer;
uint64_t index_buffer_offset;
uint64_t index_buffer_range_B;
uint64_t index_buffer_size_B;
uint32_t restart_index;
uint8_t index_buffer_el_size_B;
bool indirect;
bool indexed;
bool restart;
uint32_t predicate_count;
enum kk_predicate_op predicate_op[2];
uint32_t draw_count;
uint32_t pad_;
uint64_t predicate_addr[2];
union {
struct {
mtl_buffer *buffer;
uint64_t offset;
uint32_t stride;
} indirect_command;
/* These arrays will be >1 when draw_count is >1 as this struct is
* dynamically allocated. */
VkDrawIndirectCommand draws[1];
VkDrawIndexedIndirectCommand indexed_draws[1];
};
};
static_assert(sizeof(struct kk_draw_command) == 104u, "Packed struct");
struct kk_draw_data {
/* For non-indirect, 0 is vertex/index count, 1 instance count and 2 first
* instance */
struct kk_grid grid;
struct {
mtl_buffer *buffer;
uint64_t offset;
uint64_t range;
enum mtl_index_type type;
} index;
uint32_t vertex_offset;
enum mtl_primitive_type primitive_type;
};
static uint64_t
kk_upload_vertex_params(struct kk_cmd_buffer *cmd, struct kk_draw_data data)
{
struct kk_descriptor_state *desc = &cmd->state.gfx.descriptors;
const uint32_t wg_size[3] = {1, 1, 1};
struct poly_vertex_params params;
poly_vertex_params_init(&params, 0, wg_size);
/* XXX: We should deduplicate this logic */
bool indirect = kk_grid_is_indirect(data.grid);
if (!indirect)
poly_vertex_params_set_draw(&params, data.grid.size.x, data.grid.size.y);
if (data.index.buffer) {
params.index_buffer =
mtl_buffer_get_gpu_address(data.index.buffer) + data.index.offset;
params.index_buffer_range_el =
data.index.range / mtl_index_type_to_size_B(data.index.type);
}
struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
params.outputs = vs->info.vs.outputs_written;
if (!indirect) {
uint32_t verts = data.grid.size.x, instances = data.grid.size.y;
unsigned vb_size =
poly_tcs_in_size(verts * instances, vs->info.vs.outputs_written);
/* Allocate if there are any outputs, or use the null sink to trap
* reads if there aren't. Those reads are undefined but should not
* fault. Affects:
*
* dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
*/
if (vb_size)
params.output_buffer = kk_pool_alloc(cmd, vb_size, 4).gpu;
else
params.output_buffer = 0u;
}
desc->root.draw.vertex_outputs = params.outputs;
return kk_pool_upload(cmd, &params, sizeof(params), 8).gpu;
}
static void
kk_upload_tess_params(struct kk_cmd_buffer *cmd, struct poly_tess_params *out,
struct kk_draw_data draw)
{
struct kk_device *dev = kk_cmd_buffer_device(cmd);
struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
struct kk_graphics_state *gfx = &cmd->state.gfx;
struct kk_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
enum poly_tess_partitioning partitioning =
gfx->tess.info.spacing == TESS_SPACING_EQUAL
? POLY_TESS_PARTITIONING_INTEGER
: gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
? POLY_TESS_PARTITIONING_FRACTIONAL_ODD
: POLY_TESS_PARTITIONING_FRACTIONAL_EVEN;
struct poly_tess_params args = {
.heap = kk_heap(cmd),
.tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
.statistic = 0u,
.input_patch_size = dyn->ts.patch_control_points,
.output_patch_size = tcs->info.tess.tcs_output_patch_size,
.tcs_patch_constants = tcs->info.tess.tcs_nr_patch_outputs,
.tcs_per_vertex_outputs = tcs->info.tess.tcs_per_vertex_outputs,
.partitioning = partitioning,
.points_mode = gfx->tess.info.points,
.isolines = gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES,
};
if (!args.points_mode && gfx->tess.info.mode != TESS_PRIMITIVE_ISOLINES) {
args.ccw = gfx->tess.info.ccw;
args.ccw ^=
dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
}
uint32_t draw_stride_el = 5;
size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
/* heap is allocated by kk_heap */
/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
args.patch_coord_buffer = dev->heap->gpu + sizeof(struct poly_heap);
if (!kk_grid_is_indirect(draw.grid)) {
unsigned in_patches = draw.grid.size.x / args.input_patch_size;
unsigned unrolled_patches = in_patches * draw.grid.size.y;
uint32_t alloc = 0;
uint32_t tcs_out_offs = alloc;
alloc += unrolled_patches * args.tcs_stride_el * sizeof(uint32_t);
uint32_t patch_coord_offs = alloc;
alloc += unrolled_patches * sizeof(uint32_t);
uint32_t count_offs = alloc;
alloc += unrolled_patches * sizeof(uint32_t);
/* Single API draw */
uint32_t draw_offs = alloc;
alloc += draw_stride_B;
struct kk_ptr ptr = kk_pool_alloc(cmd, alloc, 4);
gfx->tess.out_draws_buffer = ptr.buffer;
gfx->tess.out_draws_offset = ptr.offset + draw_offs;
uint64_t addr = ptr.gpu;
args.tcs_buffer = addr + tcs_out_offs;
args.patches_per_instance = in_patches;
args.coord_allocs = addr + patch_coord_offs;
args.nr_patches = unrolled_patches;
args.out_draws = addr + draw_offs;
args.counts = addr + count_offs;
} else {
/* Allocate 3x indirect global+local grids for VS/TCS/tess */
uint32_t grid_stride = sizeof(uint32_t) * 3;
gfx->tess.indirect_ptr = kk_pool_alloc(cmd, grid_stride * 3, 4);
struct kk_ptr ptr = kk_pool_alloc(cmd, draw_stride_B, 4);
gfx->tess.out_draws_buffer = ptr.buffer;
gfx->tess.out_draws_offset = ptr.offset;
args.out_draws = ptr.gpu;
}
memcpy(out, &args, sizeof(args));
}
static void
@ -835,10 +1087,17 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
desc->root_dirty = true;
}
if (IS_DIRTY(RS_FRONT_FACE)) {
mtl_set_front_face_winding(
enc, vk_front_face_to_mtl_winding(
cmd->vk.dynamic_graphics_state.rs.front_face));
if (IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(TS_DOMAIN_ORIGIN) ||
IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) {
bool front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
if (cmd->state.shaders[MESA_SHADER_TESS_EVAL]) {
front_face_ccw ^= gfx->tess.info.ccw;
front_face_ccw ^=
dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
}
mtl_set_front_face_winding(enc, front_face_ccw
? MTL_WINDING_COUNTER_CLOCKWISE
: MTL_WINDING_CLOCKWISE);
}
if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(RS_DEPTH_BIAS_ENABLE)) {
@ -928,87 +1187,6 @@ kk_flush_gfx_state(struct kk_cmd_buffer *cmd)
#undef IS_SHADER_DIRTY
#undef IS_DIRTY
enum kk_predicate_op : uint16_t {
/* value > draw_id */
KK_PREDICATE_GT_DRAW_ID,
/* value == 0 */
KK_PREDICATE_EQ_ZERO,
/* value != 0 */
KK_PREDICATE_NEQ_ZERO,
};
struct kk_draw_command {
enum mesa_prim prim;
/* Mask of stages that need per-draw data uploaded */
uint32_t upload_mask;
mtl_buffer *index_buffer;
uint64_t index_buffer_offset;
uint64_t index_buffer_range_B;
uint64_t index_buffer_size_B;
uint32_t restart_index;
uint8_t index_buffer_el_size_B;
bool indirect;
bool indexed;
bool restart;
uint32_t predicate_count;
enum kk_predicate_op predicate_op[2];
uint32_t draw_count;
uint32_t pad_;
uint64_t predicate_addr[2];
union {
struct {
mtl_buffer *buffer;
uint64_t offset;
uint32_t stride;
} indirect_command;
/* These arrays will be >1 when draw_count is >1 as this struct is
* dynamically allocated. */
VkDrawIndirectCommand draws[1];
VkDrawIndexedIndirectCommand indexed_draws[1];
};
};
static_assert(sizeof(struct kk_draw_command) == 104u, "Packed struct");
static void
kk_init_heap(const void *data)
{
struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data;
struct kk_device *dev = kk_cmd_buffer_device(cmd);
size_t size = 128 * 1024 * 1024;
kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap);
struct poly_heap *map = (struct poly_heap *)dev->heap->cpu;
/* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
*map = (struct poly_heap){
.base = dev->heap->gpu + sizeof(struct poly_heap),
.size = size - sizeof(struct poly_heap),
};
}
static uint64_t
kk_heap(struct kk_cmd_buffer *cmd)
{
struct kk_device *dev = kk_cmd_buffer_device(cmd);
util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
/* We need to free all allocations after each command buffer execution */
if (!cmd->uses_heap) {
uint64_t addr = dev->heap->gpu;
/* Zeroing the allocated index frees everything */
kk_cmd_write(cmd, (struct libkk_imm_write){
addr + offsetof(struct poly_heap, bottom), 0});
cmd->uses_heap = true;
}
return dev->heap->gpu;
}
/* Returns true if the draw was successfully converted. */
static bool
kk_convert_to_indirect_draw(struct kk_cmd_buffer *cmd,
@ -1193,19 +1371,6 @@ build_per_draw_upload_mask(struct kk_cmd_buffer *cmd)
return mask;
}
struct kk_draw_data {
/* For non-indirect, 0 is vertex/index count, 1 instance count and 2 first
* instance */
struct kk_grid grid;
struct {
mtl_buffer *buffer;
uint64_t offset;
enum mtl_index_type type;
} index;
uint32_t vertex_offset;
enum mtl_primitive_type primitive_type;
};
static void
kk_dispatch_draw(mtl_render_encoder *enc, struct kk_draw_data data)
{
@ -1225,9 +1390,12 @@ kk_dispatch_draw(mtl_render_encoder *enc, struct kk_draw_data data)
data.index.offset, data.grid.size.y,
data.vertex_offset, data.grid.size.z);
} else {
mtl_draw_primitives(enc, data.primitive_type, data.vertex_offset,
data.grid.size.x, data.grid.size.y,
data.grid.size.z);
/* Avoid Metal validation error. Empty draws from tessellation will
* have values set to 0. */
if (data.grid.size.x != 0 && data.grid.size.y != 0)
mtl_draw_primitives(enc, data.primitive_type, data.vertex_offset,
data.grid.size.x, data.grid.size.y,
data.grid.size.z);
}
}
}
@ -1357,6 +1525,128 @@ kk_upload_per_draw_data(struct kk_cmd_buffer *cmd, uint32_t upload_mask,
}
}
static void
kk_dispatch_compute(mtl_compute_encoder *enc, struct kk_grid grid,
struct mtl_size local_size)
{
if (grid.mode == KK_GRID_DIRECT)
mtl_dispatch_threads(enc, grid.size, local_size);
else
mtl_dispatch_threadgroups_with_indirect_buffer(enc, grid.indirect,
grid.offset, local_size);
}
static struct kk_draw_data
kk_launch_tess(struct kk_cmd_buffer *cmd, struct kk_draw_data draw,
uint32_t draw_id)
{
struct kk_device *dev = kk_cmd_buffer_device(cmd);
struct kk_graphics_state *gfx = &cmd->state.gfx;
struct kk_grid grid_vs, grid_tcs, grid_tess;
struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
struct kk_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
uint32_t input_patch_size = dyn->ts.patch_control_points;
uint64_t state = gfx->descriptors.root.draw.tess_params;
struct kk_tess_info info = gfx->tess.info;
/* Setup grids */
if (kk_grid_is_indirect(draw.grid)) {
struct libkk_tess_setup_indirect_args args = {
.p = state,
.grids = gfx->tess.indirect_ptr.gpu,
.indirect =
mtl_buffer_get_gpu_address(draw.grid.indirect) + draw.grid.offset,
.vp = gfx->descriptors.root.draw.vertex_params,
.vertex_outputs = vs->info.vs.outputs_written,
.tcs_statistic = 0,
};
if (draw.index.buffer) {
args.in_index_buffer =
mtl_buffer_get_gpu_address(draw.index.buffer) + draw.index.offset;
args.in_index_size_B = mtl_index_type_to_size_B(draw.index.type);
args.in_index_buffer_range_el =
draw.index.range / args.in_index_size_B;
}
libkk_tess_setup_indirect_struct(cmd, kk_grid_1d(1), true, args);
uint32_t grid_stride = sizeof(uint32_t) * 3;
grid_vs =
kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
gfx->tess.indirect_ptr.offset + 0u * grid_stride);
grid_tcs =
kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
gfx->tess.indirect_ptr.offset + 1u * grid_stride);
grid_tess =
kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
gfx->tess.indirect_ptr.offset + 2u * grid_stride);
} else {
uint32_t patches = draw.grid.size.x / input_patch_size;
grid_vs = grid_tcs = kk_grid_2d(draw.grid.size.x, draw.grid.size.y);
grid_tcs.size.x = patches * tcs->info.tess.tcs_output_patch_size;
grid_tess = kk_grid_1d(patches * draw.grid.size.y);
}
/* First launch the VS and TCS */
mtl_compute_encoder *enc = kk_encoder_pre_gfx_encoder(cmd);
{
mtl_compute_pipeline_state *pipeline = vs->pipeline.gfx.pre_render[0];
struct mtl_size local_size = {64, 1, 1};
mtl_compute_set_pipeline_state(enc, pipeline);
mtl_compute_set_buffer(enc, gfx->descriptors.root.root_buffer.buffer,
gfx->descriptors.root.root_buffer.offset, 0u);
struct kk_per_draw_data shader_data = {.draw_id = draw_id};
struct kk_ptr shader_data_gpu =
kk_pool_upload(cmd, &shader_data, sizeof(shader_data), 8u);
mtl_compute_set_buffer(enc, shader_data_gpu.buffer,
shader_data_gpu.offset, 2);
kk_dispatch_compute(enc, grid_vs, local_size);
/* TODO_KOSMICKRISP Maybe too big of a barrier? We could definitely just
* barrier the buffers we know we modify. */
mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
}
{
mtl_compute_pipeline_state *pipeline = vs->pipeline.gfx.pre_render[1];
struct mtl_size local_size = {tcs->info.tess.tcs_output_patch_size, 1, 1};
/* Avoid Metal validation error by trying to launch empty compute. Return
* empty data. We set restart to true to avoid unroll. */
if (grid_tcs.mode == KK_GRID_DIRECT && grid_tcs.size.x == 0u)
return (struct kk_draw_data){.grid = kk_grid_1d(0u)};
mtl_compute_set_pipeline_state(enc, pipeline);
kk_dispatch_compute(enc, grid_tcs, local_size);
mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
}
/* First generate counts, then prefix sum them, and then tessellate. */
libkk_tessellate(cmd, grid_tess, true, info.mode, POLY_TESS_MODE_COUNT,
state);
mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
libkk_prefix_sum_tess(cmd, kk_grid_1d(1u), true, state);
mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
libkk_tessellate(cmd, grid_tess, true, info.mode, POLY_TESS_MODE_WITH_COUNTS,
state);
mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
draw.grid =
kk_grid_indirect(gfx->tess.out_draws_buffer, gfx->tess.out_draws_offset);
draw.index.buffer = dev->heap->map;
draw.index.offset = sizeof(struct poly_heap);
draw.index.type = MTL_INDEX_TYPE_UINT32;
draw.primitive_type = mesa_prim_to_mtl_primitive_type(gfx->tess.prim);
return draw;
}
/* When the current draw contains stages not present in Metal such as
* tessellation, this step will launch required emulation when needed and build
* the per draw data required to launch the Metal draw. */
@ -1364,30 +1654,99 @@ static struct kk_draw_data
build_draw_data(struct kk_cmd_buffer *cmd, struct kk_draw_command *data,
uint32_t draw_id)
{
bool tess = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
struct kk_draw_data draw = {
.index.buffer = data->index_buffer,
.index.offset = data->index_buffer_offset,
.index.type = data->indexed ? index_size_in_bytes_to_mtl_index_type(
data->index_buffer_el_size_B)
: 0u,
.primitive_type = mesa_prim_to_mtl_primitive_type(data->prim),
.index.range = data->index_buffer_range_B,
.primitive_type = tess ? 0u : mesa_prim_to_mtl_primitive_type(data->prim),
};
uint64_t first_vertex_gpu = 0u;
uint64_t base_instance_gpu = 0u;
if (data->indirect) {
draw.grid = kk_grid_indirect(data->indirect_command.buffer,
data->indirect_command.offset +
draw_id * data->indirect_command.stride);
uint64_t indirect_offset = data->indirect_command.offset +
draw_id * data->indirect_command.stride;
draw.grid =
kk_grid_indirect(data->indirect_command.buffer, indirect_offset);
if (tess) {
uint64_t first_vertex_offset =
data->indexed ? offsetof(VkDrawIndexedIndirectCommand, vertexOffset)
: offsetof(VkDrawIndirectCommand, firstVertex);
uint64_t base_instance_offset =
data->indexed
? offsetof(VkDrawIndexedIndirectCommand, firstInstance)
: offsetof(VkDrawIndirectCommand, firstInstance);
first_vertex_gpu =
mtl_buffer_get_gpu_address(data->indirect_command.buffer) +
indirect_offset + first_vertex_offset;
base_instance_gpu =
mtl_buffer_get_gpu_address(data->indirect_command.buffer) +
indirect_offset + base_instance_offset;
}
} else if (data->indexed) {
VkDrawIndexedIndirectCommand cmd = data->indexed_draws[draw_id];
draw.grid =
kk_grid_3d(cmd.indexCount, cmd.instanceCount, cmd.firstInstance);
draw.vertex_offset = cmd.vertexOffset;
draw.index.offset += cmd.firstIndex * data->index_buffer_el_size_B;
VkDrawIndexedIndirectCommand draw_cmd = data->indexed_draws[draw_id];
draw.grid = kk_grid_3d(draw_cmd.indexCount, draw_cmd.instanceCount,
draw_cmd.firstInstance);
draw.vertex_offset = draw_cmd.vertexOffset;
draw.index.offset += draw_cmd.firstIndex * data->index_buffer_el_size_B;
if (tess) {
first_vertex_gpu = kk_pool_upload(cmd, &draw_cmd.vertexOffset,
sizeof(draw_cmd.vertexOffset), 4u)
.gpu;
base_instance_gpu = kk_pool_upload(cmd, &draw_cmd.firstInstance,
sizeof(draw_cmd.firstInstance), 4u)
.gpu;
}
} else {
VkDrawIndirectCommand cmd = data->draws[draw_id];
draw.grid =
kk_grid_3d(cmd.vertexCount, cmd.instanceCount, cmd.firstInstance);
draw.vertex_offset = cmd.firstVertex;
VkDrawIndirectCommand draw_cmd = data->draws[draw_id];
draw.grid = kk_grid_3d(draw_cmd.vertexCount, draw_cmd.instanceCount,
draw_cmd.firstInstance);
draw.vertex_offset = draw_cmd.firstVertex;
if (tess) {
first_vertex_gpu = kk_pool_upload(cmd, &draw_cmd.firstVertex,
sizeof(draw_cmd.firstVertex), 4u)
.gpu;
base_instance_gpu = kk_pool_upload(cmd, &draw_cmd.firstInstance,
sizeof(draw_cmd.firstInstance), 4u)
.gpu;
}
}
/* Emulate tessellation. */
if (tess) {
struct kk_ptr tess_args = {};
struct kk_graphics_state *gfx = &cmd->state.gfx;
struct kk_descriptor_state *desc = &gfx->descriptors;
if (cmd->state.shaders[MESA_SHADER_TESS_EVAL]) {
gfx->descriptors.root.draw.index_size = data->index_buffer_el_size_B;
gfx->descriptors.root.draw.base_vertex_addr = first_vertex_gpu;
gfx->descriptors.root.draw.base_instance_addr = base_instance_gpu;
desc->root.draw.vertex_params = kk_upload_vertex_params(cmd, draw);
tess_args = kk_pool_alloc(cmd, sizeof(struct poly_tess_params), 4);
gfx->descriptors.root.draw.tess_params = tess_args.gpu;
gfx->descriptors.root_dirty = true;
}
if (desc->root_dirty) {
kk_upload_descriptor_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
struct kk_ptr root_buffer = desc->root.root_buffer;
mtl_set_vertex_buffer(kk_render_encoder(cmd), root_buffer.buffer,
root_buffer.offset, 0);
mtl_set_fragment_buffer(kk_render_encoder(cmd), root_buffer.buffer,
root_buffer.offset, 0);
if (tess_args.gpu) {
kk_upload_tess_params(cmd, tess_args.cpu, draw);
}
}
draw = kk_launch_tess(cmd, draw, draw_id);
}
return draw;
@ -1406,11 +1765,14 @@ kk_draw(struct kk_cmd_buffer *cmd, struct kk_draw_command *data)
if (data->predicate_count > 0 && !kk_predicate_draws(cmd, data))
return;
/* Unroll geometry. Skip draw if we fail. */
bool requires_unroll = data->prim == MESA_PRIM_TRIANGLE_FAN ||
requires_index_promotion(data) ||
requires_unroll_restart(cmd, data) ||
requires_index_robustness(cmd, data);
bool tess = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
/* Unroll geometry. Skip draw if we fail. No need to unroll if tessellation
* is present since it also handles unrolling. */
bool requires_unroll = !tess && (data->prim == MESA_PRIM_TRIANGLE_FAN ||
requires_index_promotion(data) ||
requires_unroll_restart(cmd, data) ||
requires_index_robustness(cmd, data));
if (requires_unroll && !kk_unroll_geometry(cmd, data))
return;

View file

@ -816,9 +816,39 @@ lower_poly(struct nir_builder *b, nir_intrinsic_instr *intrin, void *data)
case nir_intrinsic_load_index_size_poly:
return lower_sysval_to_root_table(b, intrin, draw.index_size);
case nir_intrinsic_load_first_vertex:
if (*(bool *)data)
return lower_sysval_to_root_table(b, intrin, draw.base_vertex);
FALLTHROUGH;
/* Lower only compute shaders */
if (*(bool *)data) {
uint32_t root_table_offset =
kk_root_descriptor_offset(draw.base_vertex_addr);
b->cursor = nir_instr_remove(&intrin->instr);
assert((root_table_offset & 3) == 0 && "aligned");
nir_def *addr = load_root(b, intrin->def.num_components, 64u,
nir_imm_int(b, root_table_offset), 4);
nir_def *val = nir_load_global(b, 1u, intrin->def.bit_size, addr);
nir_def_rewrite_uses(&intrin->def, val);
return true;
}
return false;
case nir_intrinsic_load_base_instance:
/* Lower only compute shaders */
if (*(bool *)data) {
uint32_t root_table_offset =
kk_root_descriptor_offset(draw.base_instance_addr);
b->cursor = nir_instr_remove(&intrin->instr);
assert((root_table_offset & 3) == 0 && "aligned");
nir_def *addr = load_root(b, intrin->def.num_components, 64u,
nir_imm_int(b, root_table_offset), 4);
nir_def *val = nir_load_global(b, 1u, intrin->def.bit_size, addr);
nir_def_rewrite_uses(&intrin->def, val);
return true;
}
return false;
default:
return false;
}

View file

@ -17,9 +17,6 @@
struct ctx {
struct kk_attribute *attribs;
bool requires_vertex_id;
bool requires_instance_id;
bool requires_base_instance;
bool requires_robustness2;
};
@ -165,22 +162,19 @@ pass(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *el;
if (attrib.instanced) {
if (attrib.divisor > 0) {
/* Metal's instance_id has base_instance included */
nir_def *instance_id =
nir_isub(b, nir_load_instance_id(b), nir_load_base_instance(b));
el = nir_udiv_imm(b, instance_id, attrib.divisor);
ctx->requires_instance_id = true;
el = nir_udiv_imm(b, nir_load_instance_id(b), attrib.divisor);
BITSET_SET(b->shader->info.system_values_read,
SYSTEM_VALUE_INSTANCE_ID);
} else
el = nir_imm_int(b, 0);
el = nir_iadd(b, el, nir_load_base_instance(b));
ctx->requires_base_instance = true;
BITSET_SET(b->shader->info.system_values_read,
SYSTEM_VALUE_BASE_INSTANCE);
} else {
el = nir_load_vertex_id(b);
ctx->requires_vertex_id = true;
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
}
/* Load the pointer of the buffer from the argument buffer */
@ -283,14 +277,6 @@ kk_nir_lower_vbo(nir_shader *nir, struct kk_attribute *attribs,
.attribs = attribs,
.requires_robustness2 = robustness2,
};
bool progress =
nir_shader_intrinsics_pass(nir, pass, nir_metadata_control_flow, &ctx);
if (ctx.requires_instance_id)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
if (ctx.requires_base_instance)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
if (ctx.requires_vertex_id)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
return progress;
return nir_shader_intrinsics_pass(nir, pass, nir_metadata_control_flow,
&ctx);
}

View file

@ -227,6 +227,7 @@ kk_get_device_features(
.shaderStorageImageReadWithoutFormat = true,
.shaderStorageImageWriteWithoutFormat = true,
.shaderUniformBufferArrayDynamicIndexing = true,
.tessellationShader = true,
.textureCompressionASTC_LDR = true,
.textureCompressionBC = true,
.textureCompressionETC2 = true,

View file

@ -660,10 +660,10 @@ gather_shader_info(struct kk_shader *shader, nir_shader *nir,
{
shader->info.stage = nir->info.stage;
shader->info.uses_per_draw_data = msl_gather_uses_per_draw_data(nir);
shader->info.num_cull_distances = nir->info.cull_distance_array_size;
if (nir->info.stage == MESA_SHADER_VERTEX) {
nir_shader_intrinsics_pass(nir, gather_vs_inputs, nir_metadata_all,
&shader->info.vs.attribs_read);
shader->info.vs.num_cull_distances = nir->info.cull_distance_array_size;
shader->info.vs.outputs_written = nir->info.outputs_written;
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
/* Some meta shaders like vk-meta-resolve will have depth_layout as NONE
@ -761,7 +761,7 @@ kk_compile_shader(struct kk_device *dev, nir_shader *nir,
gather_shader_info(shader, nir, state);
unsigned num_cull_distances =
prev_stage ? prev_stage->info.vs.num_cull_distances : 0;
prev_stage ? prev_stage->info.num_cull_distances : 0;
msl_nir_lower_clip_cull_distance(nir, num_cull_distances);
/* When using poly to emulate tessellation, vertex and tess control shaders
@ -778,7 +778,11 @@ kk_compile_shader(struct kk_device *dev, nir_shader *nir,
memset(&nir->info.cs, 0, sizeof(nir->info.cs));
nir->xfb_info = NULL;
NIR_PASS(_, nir, poly_nir_lower_sw_vs);
}
} else
/* Metal's instance_id contains base_instance. When the emulation path
* is taken, since we launch compute, they correctly get translated.
* For the non-emulated path we need to subtract base_instance... */
NIR_PASS(_, nir, msl_nir_lower_instance_id);
} else if (stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS(_, nir, poly_nir_lower_tcs);

View file

@ -48,6 +48,10 @@ kk_tess_info_merge(struct kk_tess_info a, struct kk_tess_info b)
struct kk_shader_info {
mesa_shader_stage stage;
bool uses_per_draw_data;
/* Required for fragment shader cull distance discards. */
uint8_t num_cull_distances;
union {
/* Vertex shader is the pipeline, store all relevant data here. */
struct {
@ -62,9 +66,6 @@ struct kk_shader_info {
uint32_t sample_count;
uint64_t outputs_written;
/* Required for fragment shader cull distance discards. */
uint8_t num_cull_distances;
/* Data needed for serialization. */
enum mtl_primitive_topology_class topology;
enum mtl_pixel_format rt_formats[MAX_DRAW_BUFFERS];