nvk: Lower mesh and task shaders

Signed-off-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Tested-by: Thomas H.P. Andersen <phomes@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27196>
This commit is contained in:
Mary Guillemard 2026-02-05 21:09:42 +01:00 committed by Marge Bot
parent cf29933de1
commit cdb0dea462
13 changed files with 1117 additions and 18 deletions

View file

@ -23,6 +23,7 @@ libnak_c_files = files(
'nak_nir_lower_fs_inputs.c',
'nak_nir_lower_gs_intrinsics.c',
'nak_nir_lower_image_addrs.c',
'nak_nir_lower_mesh_intrinsics.c',
'nak_nir_lower_non_uniform_ldcx.c',
'nak_nir_lower_scan_reduce.c',
'nak_nir_lower_shared_atomics.c',

View file

@ -109,7 +109,8 @@ const extern struct nak_constant_offset_info nak_const_offsets_turing_graphics;
void nak_postprocess_nir(nir_shader *nir, const struct nak_compiler *nak,
nir_variable_mode robust2_modes,
const struct nak_fs_key *fs_key);
const struct nak_fs_key *fs_key,
bool has_task_shader);
enum ENUM_PACKED nak_ts_domain {
NAK_TS_DOMAIN_ISOLINE = 0,

View file

@ -485,7 +485,9 @@ fn nak_compile_shader_internal(
fs_key: *const nak_fs_key,
has_task_shader: bool,
) -> *mut nak_shader_bin {
unsafe { nak_postprocess_nir(nir, nak, robust2_modes, fs_key) };
unsafe {
nak_postprocess_nir(nir, nak, robust2_modes, fs_key, has_task_shader)
};
let nak = unsafe { &*nak };
let nir = unsafe { &*nir };
let fs_key = if fs_key.is_null() {

View file

@ -3748,6 +3748,11 @@ impl<'a> ShaderFromNir<'a> {
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_shared_atomic_nv => {
assert!(
self.nir.info.stage() == MESA_SHADER_COMPUTE
|| self.nir.info.stage() == MESA_SHADER_KERNEL
);
let bit_size = intrin.def.bit_size();
let addr = self.get_src(&srcs[0]);
let uaddr = self.get_src(&srcs[1]);
@ -3775,6 +3780,11 @@ impl<'a> ShaderFromNir<'a> {
self.set_dst(&intrin.def, dst);
}
nir_intrinsic_shared_atomic_swap_nv => {
assert!(
self.nir.info.stage() == MESA_SHADER_COMPUTE
|| self.nir.info.stage() == MESA_SHADER_KERNEL
);
assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
let bit_size = intrin.def.bit_size();
let addr = self.get_src(&srcs[0]);

View file

@ -69,6 +69,14 @@ nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
*/
return true;
case MESA_SHADER_TASK:
case MESA_SHADER_MESH:
/*
* Task and Mesh runs on the Vertex and Tesselation stage and follows the
* same rules.
*/
return true;
case MESA_SHADER_COMPUTE:
case MESA_SHADER_KERNEL: {
if (nir->info.workgroup_size_variable)
@ -414,6 +422,7 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
case VARYING_SLOT_POS: return NAK_ATTR_POSITION;
case VARYING_SLOT_CLIP_DIST0: return NAK_ATTR_CLIP_CULL_DIST_0;
case VARYING_SLOT_CLIP_DIST1: return NAK_ATTR_CLIP_CULL_DIST_4;
case VARYING_SLOT_VIEWPORT_MASK: return NAK_ATTR_VIEWPORT_MASK;
case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
return nak->sm >= 86 ? NAK_ATTR_VPRS_TABLE_INDEX
: NAK_ATTR_VIEWPORT_INDEX;
@ -422,6 +431,23 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
}
}
uint16_t
nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
{
switch (slot) {
/* Don't map to anything in SPH */
case VARYING_SLOT_PRIMITIVE_COUNT:
case VARYING_SLOT_PRIMITIVE_INDICES:
return 0;
case VARYING_SLOT_VIEWPORT:
case VARYING_SLOT_CULL_PRIMITIVE:
UNREACHABLE("Should have been lowered by nak_nir_lower_mesh_emulated_attributes");
default: return nak_varying_attr_addr(nak, slot);
}
}
static uint16_t
nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
{
@ -552,6 +578,9 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
}
case nir_intrinsic_load_local_invocation_id: {
/* Should have been lowered earlier */
assert(!mesa_shader_stage_is_mesh(b->shader->info.stage));
nir_def *x = nak_nir_load_sysval(b, NAK_SV_TID_X,
ACCESS_CAN_REORDER);
nir_def *y = nak_nir_load_sysval(b, NAK_SV_TID_Y,
@ -607,6 +636,15 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
}
break;
case nir_intrinsic_load_local_invocation_index: {
if (b->shader->info.stage != MESA_SHADER_TASK &&
b->shader->info.stage != MESA_SHADER_MESH)
return false;
val = nak_nir_load_sysval(b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER);
break;
}
case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_helper_invocation: {
val = nak_nir_load_sysval(b, NAK_SV_THREAD_KILL, 0);
@ -964,6 +1002,31 @@ nak_mem_access_size_align(nir_intrinsic_op intrin,
}
}
static nir_mem_access_size_align
nak_mesh_mem_access_size_align(nir_intrinsic_op intrin,
uint8_t bytes, uint8_t bit_size,
uint32_t align_mul, uint32_t align_offset,
bool offset_is_const, enum gl_access_qualifier access,
const void *cb_data)
{
switch (intrin) {
case nir_intrinsic_load_shared:
case nir_intrinsic_load_task_payload:
case nir_intrinsic_store_shared:
return (nir_mem_access_size_align) {
.bit_size = 32,
.num_components = 1,
.align = 4,
.shift = nir_mem_access_shift_method_scalar,
};
default:
return nak_mem_access_size_align(intrin, bytes, bit_size, align_mul,
align_offset, offset_is_const, access,
cb_data);
}
}
static bool
nir_shader_has_local_variables(const nir_shader *nir)
{
@ -1284,14 +1347,108 @@ nak_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data)
}
}
static void
nak_mesh_skew_attr_mark_used(struct lower_mesh_intrinsics_ctx *ctx,
uint32_t base_addr,
uint32_t range,
bool per_primitive)
{
if (base_addr == 0)
return;
const uint32_t start_bit_idx = nak_mesh_skew_attr_used_index(base_addr);
const uint32_t end_bit_idx = nak_mesh_skew_attr_used_index(base_addr + range);
if (per_primitive)
BITSET_SET_RANGE(ctx->skew_prim_attr_used, start_bit_idx, end_bit_idx - 1);
else
BITSET_SET_RANGE(ctx->skew_vert_attr_used, start_bit_idx, end_bit_idx - 1);
}
static bool
nak_nir_gather_mesh_outputs(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx)
{
bool progress = false;
nir_foreach_function(func, nir) {
nir_foreach_block_safe(block, func->impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output &&
intrin->intrinsic != nir_intrinsic_store_per_vertex_output)
continue;
nir_def *offset = intrin->src[2].ssa;
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
uint32_t component = nir_intrinsic_component(intrin);
uint32_t base_addr = nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location);
/* Skip non SPH attributes */
if (base_addr == 0)
continue;
base_addr += 4 * component;
uint32_t range;
if (nir_src_is_const(nir_src_for_ssa(offset))) {
uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
/* Tighten the range */
base_addr += const_offset * 16;
range = 4 * intrin->num_components;
} else {
range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
}
const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_store_per_primitive_output;
nak_mesh_skew_attr_mark_used(ctx, base_addr, range, is_per_primitive);
}
}
}
return progress;
}
void
nak_postprocess_nir(nir_shader *nir,
const struct nak_compiler *nak,
nir_variable_mode robust2_modes,
const struct nak_fs_key *fs_key)
const struct nak_fs_key *fs_key,
bool has_task_shader)
{
UNUSED bool progress = false;
const bool is_mesh_stage = nir->info.stage == MESA_SHADER_TASK ||
nir->info.stage == MESA_SHADER_MESH;
if (is_mesh_stage) {
const uint32_t wg_size = nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2];
/* As the mesh stages run as vertex or tessellation stages, we only have
* 32 local invocations in hardware, so if the user requests more than 32
* local invocations, we need to lower them. */
if (wg_size > 32) {
/* Make sure that all system values are lowered and no halt/return/goto
* are present for nir_lower_workgroup_size. */
OPT(nir, nir_lower_system_values);
OPT(nir, nir_lower_halt_to_return);
OPT(nir, nir_lower_returns);
OPT(nir, nir_lower_workgroup_size, 32);
nak_optimize_nir(nir, nak);
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
}
OPT(nir, nak_nir_lower_mesh_stages_shared_atomics);
}
nak_optimize_nir(nir, nak);
const nir_lower_subgroups_options subgroups_options = {
@ -1333,14 +1490,17 @@ nak_postprocess_nir(nir_shader *nir,
vectorize_opts.modes = nir_var_mem_global |
nir_var_mem_ssbo |
nir_var_mem_shared |
nir_var_mem_task_payload |
nir_var_shader_temp;
vectorize_opts.callback = nak_mem_vectorize_cb;
vectorize_opts.robust_modes = robust2_modes;
OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
.modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
.callback = nak_mem_access_size_align,
.modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic |
nir_var_mem_task_payload,
.callback = is_mesh_stage ? nak_mesh_mem_access_size_align
: nak_mem_access_size_align,
};
OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
@ -1417,6 +1577,30 @@ nak_postprocess_nir(nir_shader *nir,
OPT(nir, nir_opt_constant_folding);
break;
case MESA_SHADER_TASK: {
OPT(nir, nak_nir_lower_task_intrinsics);
OPT(nir, nir_opt_constant_folding);
break;
}
case MESA_SHADER_MESH: {
OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
type_size_vec4, nir_lower_io_lower_64bit_to_32);
OPT(nir, nir_opt_constant_folding);
OPT(nir, nak_nir_lower_mesh_emulated_attributes);
struct lower_mesh_intrinsics_ctx ctx = {
.nak = nak,
.max_vertices_out = nir->info.mesh.max_vertices_out,
.max_primitives_out = nir->info.mesh.max_primitives_out,
.has_task_shader = has_task_shader,
};
OPT(nir, nak_nir_gather_mesh_outputs, &ctx);
OPT(nir, nak_nir_lower_mesh_intrinsics, &ctx);
OPT(nir, nir_opt_constant_folding);
break;
}
default:
UNREACHABLE("Unsupported shader stage");
}

View file

@ -237,7 +237,8 @@ lower_fs_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
break;
}
case nir_intrinsic_load_input: {
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_primitive_input: {
const uint16_t addr = fs_input_intrin_addr(intrin, ctx->nak);
res = load_fs_input(b, intrin->def.num_components, addr, ctx->nak);
break;

View file

@ -0,0 +1,586 @@
/*
* Copyright © 2026 Valve Corporation.
* Copyright © 2023 Collabora, Ltd.
* SPDX-License-Identifier: MIT
*/
#include "nak_private.h"
#include "nir_builder.h"
static bool
lower_mesh_io_intrin(nir_builder *b,
nir_intrinsic_instr *intrin,
const struct lower_mesh_intrinsics_ctx *ctx)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *vtx = NULL, *offset = NULL, *data = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_per_primitive_output:
vtx = intrin->src[0].ssa;
offset = intrin->src[1].ssa;
break;
case nir_intrinsic_store_per_vertex_output:
case nir_intrinsic_store_per_primitive_output:
data = intrin->src[0].ssa;
vtx = intrin->src[1].ssa;
offset = intrin->src[2].ssa;
break;
default:
UNREACHABLE("unknown intrinsic");
}
const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_load_per_primitive_output ||
intrin->intrinsic == nir_intrinsic_store_per_primitive_output;
const bool is_store = data != NULL;
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
const bool is_primitive_indices = sem.location == VARYING_SLOT_PRIMITIVE_INDICES;
const struct nak_nir_isbe_flags flags = {
.access = is_primitive_indices ? NAK_ISBE_ACCESS_MAP : NAK_ISBE_ACCESS_ATTR,
.output = true,
.skew = !is_primitive_indices,
.per_primitive = is_per_primitive,
};
uint32_t base_addr =
nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location) +
4 * nir_intrinsic_component(intrin);
uint32_t range;
if (nir_src_is_const(nir_src_for_ssa(offset))) {
uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
/* Tighten the range */
base_addr += const_offset * 16;
range = 4 * intrin->num_components;
if (const_offset != 0)
offset = nir_imm_int(b, 0);
} else {
/* Offsets from NIR are in vec4's */
offset = nir_imul_imm(b, offset, 16);
range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
}
nir_def *isbe_offset;
uint32_t stride;
if (is_primitive_indices) {
const uint32_t vertices_per_prim = mesa_vertices_per_prim(b->shader->info.mesh.primitive_type);
/* Indices are 8 bits on hardware */
isbe_offset = nir_iadd(b, offset, nir_iadd_imm(b, nir_imul_imm(b, vtx, vertices_per_prim), 4));
stride = 1;
} else {
uint16_t skew_attr_offset = nak_mesh_skew_offset(ctx, sem.location, base_addr, is_per_primitive);
nir_def *skew_start_offset;
uint16_t skew_group_size;
if (is_per_primitive) {
skew_start_offset = nir_imm_int(b, nak_mesh_skew_vert_total_size(ctx));
skew_group_size = nak_mesh_skew_prim_group_size(ctx);
} else {
skew_start_offset = nir_imm_int(b, 0);
skew_group_size = nak_mesh_skew_vert_group_size(ctx);
}
/* Readjust offset to take into account SKEW groups */
nir_def *offset_ajusted = nir_imul_imm(b, offset, NAK_MESH_SKEW_GROUP_COUNT);
skew_start_offset = nir_iadd(b, skew_start_offset, nir_imul_imm(b, nir_udiv_imm(b, vtx, 32), skew_group_size));
isbe_offset = nir_iadd(b, nir_iadd_imm(b, nir_iadd(b, nir_imul_imm(b, nir_imod_imm(b, vtx, 32), 4),
skew_start_offset),
skew_attr_offset),
offset_ajusted);
stride = 4 * NAK_MESH_SKEW_GROUP_COUNT;
}
if (is_store) {
u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride);
nir_def *c_data = nir_channel(b, data, c);
/* Handle indices conversion */
if (is_primitive_indices)
c_data = nir_u2u8(b, c_data);
nir_isbewr_nv(b, c_data, c_offset, .range_base = base_addr,
.range = range, .flags = NAK_AS_U32(flags));
}
} else {
const uint8_t bit_size = is_primitive_indices ? 8 : intrin->def.bit_size;
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
for (uint32_t c = 0; c < intrin->num_components; c++) {
nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride);
nir_def *c_data =
nir_isberd_nv(b, bit_size, c_offset, .range_base = base_addr,
.range = range, .flags = NAK_AS_U32(flags));
/* Handle indices conversion */
if (is_primitive_indices)
c_data = nir_u2u32(b, c_data);
comps[c] = c_data;
}
nir_def *dst = nir_vec(b, comps, intrin->num_components);
nir_def_rewrite_uses(&intrin->def, dst);
}
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_set_vertex_and_primitive_count(nir_builder *b,
nir_intrinsic_instr *intrin)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *primitive_count = intrin->src[1].ssa;
nir_def *offset = nir_imm_int(b, 0x3);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_MAP,
.output = true,
.skew = false,
.per_primitive = false,
};
nir_isbewr_nv(b, primitive_count, offset,
.flags = NAK_AS_U32(flags));
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_load_workgroup_index(nir_builder *b,
nir_intrinsic_instr *intrin,
bool from_skew)
{
nir_function_impl *impl = nir_shader_get_entrypoint(b->shader);
/* We need to make sure that this is read before any writes to allow ISBE
* space sharing optimisation to happen */
b->cursor = nir_before_impl(impl);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = false,
.skew = from_skew,
.per_primitive = false,
};
nir_def *dst = nir_isberd_nv(b, 32, nir_imm_int(b, 0),
.range_base = NAK_ATTR_VERTEX_ID,
.range = 4,
.flags = NAK_AS_U32(flags));
nir_def_rewrite_uses(&intrin->def, dst);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_load_num_workgroups(nir_builder *b, nir_intrinsic_instr *intrin)
{
/* If we are here, we have a task shader */
b->cursor = nir_before_instr(&intrin->instr);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = false,
.skew = false,
.per_primitive = false,
};
nir_def *x =
nir_isberd_nv(b, 32, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags),
.access = ACCESS_CAN_REORDER);
nir_def *y =
nir_isberd_nv(b, 32, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags),
.access = ACCESS_CAN_REORDER);
nir_def *z =
nir_isberd_nv(b, 32, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags),
.access = ACCESS_CAN_REORDER);
nir_def *dst = nir_vec3(b, x, y, z);
nir_def_rewrite_uses(&intrin->def, dst);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_load_shared(nir_builder *b, nir_intrinsic_instr *intrin,
uint32_t base_offset)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *offset = intrin->src[0].ssa;
const uint8_t bit_size = intrin->def.bit_size;
assert(bit_size == 32 && intrin->def.num_components == 1);
const uint32_t base = nir_intrinsic_base(intrin);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = true,
.skew = false,
.per_primitive = false,
};
offset = nir_iadd_imm(b, offset, base_offset + base);
nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags));
nir_def_rewrite_uses(&intrin->def, dst);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_store_shared(nir_builder *b, nir_intrinsic_instr *intrin,
uint32_t base_offset)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *value = intrin->src[0].ssa;
nir_def *offset = intrin->src[1].ssa;
const uint8_t bit_size = value->bit_size;
assert(bit_size == 32 &&
nir_intrinsic_write_mask(intrin) == nir_component_mask(1));
const uint32_t base = nir_intrinsic_base(intrin);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = true,
.skew = false,
.per_primitive = false,
};
offset = nir_iadd_imm(b, offset, base_offset + base);
nir_isbewr_nv(b, value, offset, .flags = NAK_AS_U32(flags));
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_load_task_payload(nir_builder *b, nir_intrinsic_instr *intrin)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *offset = intrin->src[0].ssa;
const uint8_t bit_size = intrin->def.bit_size;
assert(bit_size == 32 && intrin->def.num_components == 1);
const uint32_t base = nir_intrinsic_base(intrin);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = false,
.skew = false,
.per_primitive = false,
};
offset = nir_iadd_imm(b, offset, base);
nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags),
.access = ACCESS_CAN_REORDER);
nir_def_rewrite_uses(&intrin->def, dst);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_mesh_intrin(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
const struct lower_mesh_intrinsics_ctx *ctx = cb_data;
/* Shared memory is after attributes on mesh shaders */
const uint32_t shared_memory_base = nak_mesh_skew_total_size(ctx);
assert(shared_memory_base % 0x80 == 0);
switch (intrin->intrinsic) {
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_per_primitive_output:
case nir_intrinsic_store_per_vertex_output:
case nir_intrinsic_store_per_primitive_output:
return lower_mesh_io_intrin(b, intrin, ctx);
case nir_intrinsic_set_vertex_and_primitive_count:
return lower_set_vertex_and_primitive_count(b, intrin);
case nir_intrinsic_load_workgroup_index:
return lower_load_workgroup_index(b, intrin, !ctx->has_task_shader);
case nir_intrinsic_load_num_workgroups:
return lower_load_num_workgroups(b, intrin);
case nir_intrinsic_load_shared:
return lower_load_shared(b, intrin, shared_memory_base);
case nir_intrinsic_store_shared:
return lower_store_shared(b, intrin, shared_memory_base);
case nir_intrinsic_load_task_payload:
return lower_load_task_payload(b, intrin);
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
UNREACHABLE(
"Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics");
default:
return false;
}
}
struct lower_emulated_attributes_state {
uint32_t viewport_shared_offset;
uint32_t cullprimitive_shared_offset;
};
static bool
lower_emulated_attributes_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
void *_data)
{
const struct lower_emulated_attributes_state *state = _data;
nir_def *vtx = NULL, *offset = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_per_primitive_output:
vtx = intrin->src[0].ssa;
offset = intrin->src[1].ssa;
break;
case nir_intrinsic_store_per_primitive_output:
vtx = intrin->src[1].ssa;
offset = intrin->src[2].ssa;
break;
default:
return false;
}
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
if (sem.location != VARYING_SLOT_VIEWPORT &&
sem.location != VARYING_SLOT_CULL_PRIMITIVE)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *shared_offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intrin));
shared_offset = nir_iadd(b, shared_offset, nir_imul_imm(b, vtx, 4));
if (sem.location == VARYING_SLOT_CULL_PRIMITIVE)
shared_offset =
nir_iadd_imm(b, shared_offset, state->cullprimitive_shared_offset);
else
shared_offset =
nir_iadd_imm(b, shared_offset, state->viewport_shared_offset);
if (intrin->intrinsic == nir_intrinsic_store_per_primitive_output) {
nir_def *data = intrin->src[0].ssa;
switch (sem.location) {
case VARYING_SLOT_VIEWPORT:
/* In case of Viewport, the data needs to be translated to a proper
* mask value to map to ViewportMask */
data = nir_ishl(b, nir_imm_int(b, 1), data);
break;
case VARYING_SLOT_CULL_PRIMITIVE:
/* In case of CullPrimitive, the data is already a 32-bit value so no
* translation is needed */
break;
default:
UNREACHABLE("Should never happen");
}
nir_store_shared(b, data, shared_offset);
nir_instr_remove(&intrin->instr);
} else {
/* Reading back isn't allowed by VK_EXT_mesh_shader but allowed by
* VK_NV_mesh_shader. We support readback for completeness and in case we
* add support for NV specific extension in the future */
nir_def *data = nir_load_shared(b, 1, 32, shared_offset);
switch (sem.location) {
case VARYING_SLOT_VIEWPORT:
/* In case of Viewport, find the first index that is set. */
data = nir_find_lsb(b, data);
break;
case VARYING_SLOT_CULL_PRIMITIVE:
/* In case of CullPrimitive, we check if no bits are set */
data = nir_ine_imm(b, data, 0);
break;
default:
UNREACHABLE("Should never happen");
}
nir_def_replace(&intrin->def, data);
}
return true;
}
bool
nak_nir_lower_mesh_emulated_attributes(nir_shader *nir)
{
if (nir->info.stage != MESA_SHADER_MESH)
return false;
/* Only apply this pass when really needed */
if ((nir->info.per_primitive_outputs &
(VARYING_BIT_CULL_PRIMITIVE | VARYING_BIT_VIEWPORT)) == 0)
return false;
/* If we are here, we need to emulate Viewport / CullPrimitive with the
* ViewportMask. This means if we need to always keep a shadow copy of the
* ViewportMask and CullPrimitive in shared memory and write the actual
* ViewportMask at the end of the shader. */
bool progress = false;
/* Reserve space for the Viewport and CullPrimitive shadow copies */
uint32_t shared_memory_offset = nir->info.shared_size;
nir->info.shared_size += 8 * nir->info.mesh.max_primitives_out;
struct lower_emulated_attributes_state state = {
.viewport_shared_offset = shared_memory_offset,
.cullprimitive_shared_offset =
shared_memory_offset + 4 * nir->info.mesh.max_primitives_out,
};
/* First we lower things to shared memory */
progress |= nir_shader_intrinsics_pass(nir, lower_emulated_attributes_intrin,
nir_metadata_control_flow, &state);
/* Finally, we ensure that the shared region is init at the start of the
* shader and we add primitive writes at the end of the shader to write the
* real value depending on the culling state.*/
if (progress) {
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_builder b = nir_builder_at(nir_before_impl(impl));
nir_def *zero = nir_imm_int(&b, 0);
nir_def *viewport_default = nir_imm_int(&b, 1 << 0);
nir_def *lane_id =
nak_nir_load_sysval(&b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER);
nir_push_if(&b, nir_ieq(&b, lane_id, zero));
{
for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) {
nir_store_shared(
&b, viewport_default,
nir_imm_int(&b, state.viewport_shared_offset + i * 4));
nir_store_shared(
&b, zero,
nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4));
}
}
nir_pop_if(&b, NULL);
nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
nir_var_mem_shared);
b = nir_builder_at(nir_after_impl(impl));
nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
nir_var_mem_shared);
nir_push_if(&b, nir_ieq(&b, lane_id, zero));
{
for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) {
nir_def *viewport_mask = nir_load_shared(
&b, 1, 32,
nir_imm_int(&b, state.viewport_shared_offset + i * 4));
nir_def *cull_primitive = nir_load_shared(
&b, 1, 32,
nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4));
viewport_mask = nir_bcsel(&b, nir_ine_imm(&b, cull_primitive, 0),
zero, viewport_mask);
nir_store_per_primitive_output(
&b, viewport_mask, nir_imm_int(&b, i), zero, .base = 0,
.src_type = nir_type_uint32,
.io_semantics = (nir_io_semantics){
.location = VARYING_SLOT_VIEWPORT_MASK,
.num_slots = 1,
});
}
}
nir_pop_if(&b, NULL);
}
return progress;
}
bool
nak_nir_lower_mesh_intrinsics(nir_shader *nir,
struct lower_mesh_intrinsics_ctx *ctx)
{
return nir_shader_intrinsics_pass(
nir, lower_mesh_intrin, nir_metadata_block_index | nir_metadata_dominance,
ctx);
}
static bool
lower_launch_mesh_workgroups(nir_builder *b, nir_intrinsic_instr *intrin)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *dim = intrin->src[0].ssa;
nir_def *x = nir_channel(b, dim, 0);
nir_def *y = nir_channel(b, dim, 1);
nir_def *z = nir_channel(b, dim, 2);
nir_def *task_count = nir_imul(b, nir_imul(b, x, y), z);
const struct nak_nir_isbe_flags flags = {
.access = NAK_ISBE_ACCESS_ATTR,
.output = true,
.skew = false,
.per_primitive = false,
};
nir_isbewr_nv(b, task_count, nir_imm_int(b, 0x4),
.flags = NAK_AS_U32(flags));
nir_isbewr_nv(b, x, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags));
nir_isbewr_nv(b, y, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags));
nir_isbewr_nv(b, z, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags));
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_task_intrin(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_shared:
return lower_load_shared(b, intrin, 0);
case nir_intrinsic_store_shared:
return lower_store_shared(b, intrin, 0);
case nir_intrinsic_load_workgroup_index:
return lower_load_workgroup_index(b, intrin, true);
case nir_intrinsic_launch_mesh_workgroups:
return lower_launch_mesh_workgroups(b, intrin);
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
UNREACHABLE(
"Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics");
case nir_intrinsic_load_task_payload:
case nir_intrinsic_store_task_payload:
case nir_intrinsic_task_payload_atomic:
case nir_intrinsic_task_payload_atomic_swap:
UNREACHABLE("Should have been lowered by nvk_nir_lower_task_shader");
default:
return false;
}
}
bool
nak_nir_lower_task_intrinsics(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir, lower_task_intrin,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}

View file

@ -76,6 +76,8 @@ enum ENUM_PACKED nak_attr {
NAK_ATTR_INSTANCE_ID = 0x2f8,
NAK_ATTR_VERTEX_ID = 0x2fc,
/* System values D */
NAK_ATTR_VIEWPORT_MASK = 0x3a0,
NAK_ATTR_BARY_COORD_NO_PERSP_X = 0x3a8,
NAK_ATTR_BARY_COORD_NO_PERSP_Y = 0x3ac,
NAK_ATTR_BARY_COORD_NO_PERSP_Z = 0x3b0,
@ -84,7 +86,8 @@ enum ENUM_PACKED nak_attr {
NAK_ATTR_BARY_COORD_X = 0x3b4,
NAK_ATTR_BARY_COORD_Y = 0x3b8,
NAK_ATTR_BARY_COORD_Z = 0x3bc,
NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X,
NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X,
NAK_ATTR_SPH_END = NAK_ATTR_BARY_COORD_Z + 4,
/* Not in SPH */
NAK_ATTR_FRONT_FACE = 0x3fc,
@ -100,6 +103,8 @@ nak_attribute_attr_addr(UNUSED const struct nak_compiler *nak,
uint16_t nak_varying_attr_addr(const struct nak_compiler *nak,
gl_varying_slot slot);
uint16_t nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak,
gl_varying_slot slot);
uint16_t nak_sysval_attr_addr(const struct nak_compiler *nak,
gl_system_value sysval);
@ -274,6 +279,81 @@ struct nak_nir_isbe_flags {
uint32_t pad : 27;
};
struct lower_mesh_intrinsics_ctx {
const struct nak_compiler *nak;
uint32_t max_vertices_out;
uint32_t max_primitives_out;
bool has_task_shader;
BITSET_DECLARE(skew_vert_attr_used, NAK_ATTR_SPH_END);
BITSET_DECLARE(skew_prim_attr_used, NAK_ATTR_SPH_END);
};
#define NAK_MESH_SKEW_GROUP_COUNT 32
static uint32_t
nak_mesh_skew_attr_used_index(uint32_t base_addr)
{
assert(base_addr < NAK_ATTR_SPH_END);
return base_addr / 4;
}
static uint32_t
nak_mesh_skew_vert_group_size(const struct lower_mesh_intrinsics_ctx *ctx)
{
return BITSET_COUNT(ctx->skew_vert_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT;
}
static uint32_t
nak_mesh_skew_vert_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
{
return nak_mesh_skew_vert_group_size(ctx) * DIV_ROUND_UP(ctx->max_vertices_out, NAK_MESH_SKEW_GROUP_COUNT);
}
static uint32_t
nak_mesh_skew_prim_group_size(const struct lower_mesh_intrinsics_ctx *ctx)
{
return BITSET_COUNT(ctx->skew_prim_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT;
}
static uint32_t
nak_mesh_skew_prim_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
{
return nak_mesh_skew_prim_group_size(ctx) * DIV_ROUND_UP(ctx->max_primitives_out, NAK_MESH_SKEW_GROUP_COUNT);
}
static uint32_t
nak_mesh_skew_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
{
return nak_mesh_skew_vert_total_size(ctx) + nak_mesh_skew_prim_total_size(ctx);
}
static uint32_t
nak_mesh_skew_offset(const struct lower_mesh_intrinsics_ctx *ctx,
gl_varying_slot slot,
uint32_t base_addr,
bool per_primitive)
{
const uint32_t bit_idx = nak_mesh_skew_attr_used_index(base_addr);
uint32_t bit_count;
if (per_primitive)
bit_count = BITSET_PREFIX_SUM(ctx->skew_prim_attr_used, bit_idx);
else
bit_count = BITSET_PREFIX_SUM(ctx->skew_vert_attr_used, bit_idx);
uint32_t size = bit_count * 4 * NAK_MESH_SKEW_GROUP_COUNT;
return size;
}
bool nak_nir_lower_mesh_emulated_attributes(nir_shader *nir);
bool nak_nir_lower_mesh_intrinsics(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx);
bool nak_nir_lower_task_intrinsics(nir_shader *nir);
enum nak_interp_mode {
NAK_INTERP_MODE_PERSPECTIVE,
NAK_INTERP_MODE_SCREEN_LINEAR,

View file

@ -52,6 +52,7 @@ nvk_files = files(
'nvk_mme.c',
'nvk_mme.h',
'nvk_nir_lower_descriptors.c',
'nvk_nir_lower_mesh_shader.c',
'nvk_physical_device.c',
'nvk_physical_device.h',
'nvk_private.h',

View file

@ -60,6 +60,7 @@ struct lower_descriptors_ctx {
bool use_edb_buffer_views;
bool clamp_desc_array_bounds;
bool indirect_bind;
bool has_task_shader;
nir_address_format ubo_addr_format;
nir_address_format ssbo_addr_format;
@ -1122,6 +1123,8 @@ static bool
try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
const struct lower_descriptors_ctx *ctx)
{
const mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
case nir_intrinsic_load_constant:
return lower_load_constant(b, intrin, ctx);
@ -1133,10 +1136,20 @@ try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
UNREACHABLE("Should have been lowered by nir_lower_cs_intrinsics()");
case nir_intrinsic_load_num_workgroups:
/* We use ISBE.ATTR to pass this from task. */
if (stage == MESA_SHADER_MESH && ctx->has_task_shader)
return false;
if (stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK)
return lower_sysval_to_root_table(b, intrin, draw.mesh.group_count, ctx);
return lower_sysval_to_root_table(b, intrin, cs.group_count, ctx);
case nir_intrinsic_load_base_workgroup_id:
return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx);
if (stage == MESA_SHADER_COMPUTE)
return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx);
return false;
case nir_intrinsic_load_push_constant:
return lower_load_push_constant(b, intrin, ctx);
@ -1548,6 +1561,7 @@ nvk_nir_lower_descriptors(nir_shader *nir,
rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
.indirect_bind =
shader_flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT,
.has_task_shader = (shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0,
.ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs),
.ubo_addr_format = nvk_ubo_addr_format(pdev, rs),
};

View file

@ -0,0 +1,190 @@
/*
* Copyright © 2026 Valve Corporation.
* SPDX-License-Identifier: MIT
*/
#include "util/macros.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_defines.h"
#include "nvk_shader.h"
#include "shader_enums.h"
static bool
add_task_payload_base_offset(nir_builder *b, nir_intrinsic_instr *intrin,
void *data)
{
const uint32_t *offset = data;
switch (intrin->intrinsic) {
case nir_intrinsic_load_task_payload:
case nir_intrinsic_store_task_payload:
case nir_intrinsic_task_payload_atomic:
case nir_intrinsic_task_payload_atomic_swap:
break;
default:
return false;
}
unsigned base = nir_intrinsic_base(intrin);
nir_intrinsic_set_base(intrin, base + *offset);
return true;
}
static bool
nvk_nir_lower_common_task_payload(nir_shader *nir)
{
/* The first 0x20 bytes are used by launch_mesh_workgroups */
uint32_t task_payload_reserved_size = 0x20;
/* Take into account the reserved chunk in task memory */
nir->info.task_payload_size += task_payload_reserved_size;
/* Add the reserved chunk to every task payload accesses */
return nir_shader_intrinsics_pass(nir, add_task_payload_base_offset,
nir_metadata_all,
&task_payload_reserved_size);
}
static bool
lower_set_vertex_and_primitive_count_intrin(nir_builder *b,
nir_intrinsic_instr *intrin,
UNUSED void *data)
{
if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *local_invocation_index = nir_load_local_invocation_index(b);
nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0)));
{
nir_set_vertex_and_primitive_count(
b, intrin->src[0].ssa, intrin->src[1].ssa, intrin->src[2].ssa);
}
nir_pop_if(b, NULL);
nir_instr_remove(&intrin->instr);
return true;
}
bool
nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags)
{
if (nir->info.stage != MESA_SHADER_MESH)
return false;
bool progress = false;
if ((shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0)
progress |= nvk_nir_lower_common_task_payload(nir);
progress |= nir_shader_intrinsics_pass(
nir, lower_set_vertex_and_primitive_count_intrin, nir_metadata_none,
NULL);
return progress;
}
static bool
launch_mesh_workgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
UNUSED void *data)
{
if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *local_invocation_index = nir_load_local_invocation_index(b);
nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0)));
{
nir_launch_mesh_workgroups(b, intrin->src[0].ssa);
}
nir_pop_if(b, NULL);
nir_instr_remove(&intrin->instr);
return true;
}
static nir_intrinsic_op
task_payload_intrinsic_to_shared(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_load_task_payload:
return nir_intrinsic_load_shared;
case nir_intrinsic_store_task_payload:
return nir_intrinsic_store_shared;
case nir_intrinsic_task_payload_atomic:
return nir_intrinsic_shared_atomic;
case nir_intrinsic_task_payload_atomic_swap:
return nir_intrinsic_shared_atomic_swap;
default:
return nir_num_intrinsics;
}
}
static bool
lower_task_payload_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
UNUSED void *data)
{
nir_intrinsic_op new_op =
task_payload_intrinsic_to_shared(intrin->intrinsic);
if (new_op == nir_num_intrinsics)
return false;
intrin->intrinsic = new_op;
return true;
}
static bool
add_shared_base_offset(nir_builder *b, nir_intrinsic_instr *intrin,
UNUSED void *data)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
break;
default:
return false;
}
const uint32_t shared_memory_base = b->shader->info.task_payload_size;
assert(shared_memory_base % 0x80 == 0);
unsigned base = nir_intrinsic_base(intrin);
nir_intrinsic_set_base(intrin, base + shared_memory_base);
return true;
}
bool
nvk_nir_lower_task_shader(nir_shader *nir)
{
if (nir->info.stage != MESA_SHADER_TASK)
return false;
bool progress = false;
/* Apply common lowering for task payload */
progress |= nvk_nir_lower_common_task_payload(nir);
/* Ensure alignment based on ISBE mem lines size (128 bytes) */
nir->info.task_payload_size = align(nir->info.task_payload_size, 128);
/* Readjust shared memory size to include the task payload */
nir->info.shared_size += nir->info.task_payload_size;
/* Now move all shared memory after task payload range and lower task payload
* to shared memory */
progress |= nir_shader_intrinsics_pass(nir, add_shared_base_offset,
nir_metadata_all, NULL);
progress |= nir_shader_intrinsics_pass(nir, lower_task_payload_intrin,
nir_metadata_all, NULL);
/* Finally we ensure that launch_mesh_workgroups is only running on lane 0 */
progress |= nir_shader_intrinsics_pass(nir, launch_mesh_workgroups_intrin,
nir_metadata_none, NULL);
return progress;
}

View file

@ -382,8 +382,10 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
lookup_ycbcr_conversion, &ycbcr_state);
nir_lower_compute_system_values_options csv_options = {
.has_base_workgroup_id = true,
.has_base_workgroup_id = mesa_shader_stage_is_compute(nir->info.stage),
.lower_local_invocation_index = mesa_shader_stage_is_compute(nir->info.stage),
.lower_workgroup_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage),
.lower_cs_local_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage),
};
NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
@ -461,19 +463,43 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
lower_load_intrinsic, nir_metadata_none, pdev);
if (mesa_shader_stage_is_compute(nir->info.stage)) {
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared, shared_var_info);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
if (mesa_shader_stage_uses_workgroup(nir->info.stage)) {
nir_variable_mode var_modes = nir_var_mem_shared;
if (mesa_shader_stage_is_mesh(nir->info.stage))
var_modes |= nir_var_mem_task_payload;
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, var_modes,
shared_var_info);
NIR_PASS(_, nir, nir_lower_explicit_io, var_modes,
nir_address_format_32bit_offset);
if (nir->info.stage == MESA_SHADER_TASK)
NIR_PASS(_, nir, nvk_nir_lower_task_shader);
else if (nir->info.stage == MESA_SHADER_MESH)
NIR_PASS(_, nir, nvk_nir_lower_mesh_shader, shader_flags);
if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
/* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's safe to
* align everything up to 16B so we can write whole vec4s.
*/
nir->info.shared_size = align(nir->info.shared_size, 16);
uint32_t alignment;
uint32_t chunk_size;
if (mesa_shader_stage_is_mesh(nir->info.stage)) {
/* With task/mesh shaders, shared is in ISBE attribute space and is
* allocated in "lines" of 128 bytes. Additionally, we ISBE I/O
* instructions only support 1B and 4B granualities.*/
alignment = 128;
chunk_size = 4;
} else {
/* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's
* safe to align everything up to 16B so we can write whole vec4s.
*/
alignment = 16;
chunk_size = 16;
}
nir->info.shared_size = align(nir->info.shared_size, alignment);
NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
nir->info.shared_size, 16);
nir->info.shared_size, chunk_size);
/* We need to call lower_compute_system_values again because
* nir_zero_initialize_shared_memory generates load_invocation_id which

View file

@ -158,6 +158,9 @@ nvk_nir_lower_descriptors(nir_shader *nir,
struct vk_descriptor_set_layout * const *set_layouts,
struct nvk_cbuf_map *cbuf_map_out);
bool nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags);
bool nvk_nir_lower_task_shader(nir_shader *nir);
VkResult
nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir,
const VkAllocationCallbacks *alloc,