From cdb0dea462e201f7fafe1eff47f034a609007ad4 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Thu, 5 Feb 2026 21:09:42 +0100 Subject: [PATCH] nvk: Lower mesh and task shaders Signed-off-by: Mary Guillemard Reviewed-by: Mel Henning Tested-by: Thomas H.P. Andersen Part-of: --- src/nouveau/compiler/meson.build | 1 + src/nouveau/compiler/nak.h | 3 +- src/nouveau/compiler/nak/api.rs | 4 +- src/nouveau/compiler/nak/from_nir.rs | 10 + src/nouveau/compiler/nak_nir.c | 190 +++++- .../compiler/nak_nir_lower_fs_inputs.c | 3 +- .../compiler/nak_nir_lower_mesh_intrinsics.c | 586 ++++++++++++++++++ src/nouveau/compiler/nak_private.h | 82 ++- src/nouveau/vulkan/meson.build | 1 + .../vulkan/nvk_nir_lower_descriptors.c | 16 +- .../vulkan/nvk_nir_lower_mesh_shader.c | 190 ++++++ src/nouveau/vulkan/nvk_shader.c | 46 +- src/nouveau/vulkan/nvk_shader.h | 3 + 13 files changed, 1117 insertions(+), 18 deletions(-) create mode 100644 src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c create mode 100644 src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c diff --git a/src/nouveau/compiler/meson.build b/src/nouveau/compiler/meson.build index 7bdc07ceaef..a5e25d3ecf1 100644 --- a/src/nouveau/compiler/meson.build +++ b/src/nouveau/compiler/meson.build @@ -23,6 +23,7 @@ libnak_c_files = files( 'nak_nir_lower_fs_inputs.c', 'nak_nir_lower_gs_intrinsics.c', 'nak_nir_lower_image_addrs.c', + 'nak_nir_lower_mesh_intrinsics.c', 'nak_nir_lower_non_uniform_ldcx.c', 'nak_nir_lower_scan_reduce.c', 'nak_nir_lower_shared_atomics.c', diff --git a/src/nouveau/compiler/nak.h b/src/nouveau/compiler/nak.h index 06bada6ac0c..d7742f1bf32 100644 --- a/src/nouveau/compiler/nak.h +++ b/src/nouveau/compiler/nak.h @@ -109,7 +109,8 @@ const extern struct nak_constant_offset_info nak_const_offsets_turing_graphics; void nak_postprocess_nir(nir_shader *nir, const struct nak_compiler *nak, nir_variable_mode robust2_modes, - const struct nak_fs_key *fs_key); + const struct nak_fs_key *fs_key, + bool has_task_shader); enum ENUM_PACKED nak_ts_domain { NAK_TS_DOMAIN_ISOLINE = 0, diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs index 49436ae53ee..b5120363ad2 100644 --- a/src/nouveau/compiler/nak/api.rs +++ b/src/nouveau/compiler/nak/api.rs @@ -485,7 +485,9 @@ fn nak_compile_shader_internal( fs_key: *const nak_fs_key, has_task_shader: bool, ) -> *mut nak_shader_bin { - unsafe { nak_postprocess_nir(nir, nak, robust2_modes, fs_key) }; + unsafe { + nak_postprocess_nir(nir, nak, robust2_modes, fs_key, has_task_shader) + }; let nak = unsafe { &*nak }; let nir = unsafe { &*nir }; let fs_key = if fs_key.is_null() { diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index 6521d51318a..d261d269223 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -3748,6 +3748,11 @@ impl<'a> ShaderFromNir<'a> { self.set_dst(&intrin.def, dst.into()); } nir_intrinsic_shared_atomic_nv => { + assert!( + self.nir.info.stage() == MESA_SHADER_COMPUTE + || self.nir.info.stage() == MESA_SHADER_KERNEL + ); + let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); let uaddr = self.get_src(&srcs[1]); @@ -3775,6 +3780,11 @@ impl<'a> ShaderFromNir<'a> { self.set_dst(&intrin.def, dst); } nir_intrinsic_shared_atomic_swap_nv => { + assert!( + self.nir.info.stage() == MESA_SHADER_COMPUTE + || self.nir.info.stage() == MESA_SHADER_KERNEL + ); + assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg); let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 766f9311d9a..dd8854298a8 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -69,6 +69,14 @@ nak_nir_workgroup_has_one_subgroup(const nir_shader *nir) */ return true; + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + /* + * Task and Mesh runs on the Vertex and Tesselation stage and follows the + * same rules. + */ + return true; + case MESA_SHADER_COMPUTE: case MESA_SHADER_KERNEL: { if (nir->info.workgroup_size_variable) @@ -414,6 +422,7 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot) case VARYING_SLOT_POS: return NAK_ATTR_POSITION; case VARYING_SLOT_CLIP_DIST0: return NAK_ATTR_CLIP_CULL_DIST_0; case VARYING_SLOT_CLIP_DIST1: return NAK_ATTR_CLIP_CULL_DIST_4; + case VARYING_SLOT_VIEWPORT_MASK: return NAK_ATTR_VIEWPORT_MASK; case VARYING_SLOT_PRIMITIVE_SHADING_RATE: return nak->sm >= 86 ? NAK_ATTR_VPRS_TABLE_INDEX : NAK_ATTR_VIEWPORT_INDEX; @@ -422,6 +431,23 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot) } } +uint16_t +nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot) +{ + switch (slot) { + /* Don't map to anything in SPH */ + case VARYING_SLOT_PRIMITIVE_COUNT: + case VARYING_SLOT_PRIMITIVE_INDICES: + return 0; + case VARYING_SLOT_VIEWPORT: + case VARYING_SLOT_CULL_PRIMITIVE: + UNREACHABLE("Should have been lowered by nak_nir_lower_mesh_emulated_attributes"); + + default: return nak_varying_attr_addr(nak, slot); + } +} + + static uint16_t nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx) { @@ -552,6 +578,9 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin, } case nir_intrinsic_load_local_invocation_id: { + /* Should have been lowered earlier */ + assert(!mesa_shader_stage_is_mesh(b->shader->info.stage)); + nir_def *x = nak_nir_load_sysval(b, NAK_SV_TID_X, ACCESS_CAN_REORDER); nir_def *y = nak_nir_load_sysval(b, NAK_SV_TID_Y, @@ -607,6 +636,15 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin, } break; + case nir_intrinsic_load_local_invocation_index: { + if (b->shader->info.stage != MESA_SHADER_TASK && + b->shader->info.stage != MESA_SHADER_MESH) + return false; + + val = nak_nir_load_sysval(b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER); + break; + } + case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_helper_invocation: { val = nak_nir_load_sysval(b, NAK_SV_THREAD_KILL, 0); @@ -964,6 +1002,31 @@ nak_mem_access_size_align(nir_intrinsic_op intrin, } } +static nir_mem_access_size_align +nak_mesh_mem_access_size_align(nir_intrinsic_op intrin, + uint8_t bytes, uint8_t bit_size, + uint32_t align_mul, uint32_t align_offset, + bool offset_is_const, enum gl_access_qualifier access, + const void *cb_data) +{ + switch (intrin) { + case nir_intrinsic_load_shared: + case nir_intrinsic_load_task_payload: + case nir_intrinsic_store_shared: + return (nir_mem_access_size_align) { + .bit_size = 32, + .num_components = 1, + .align = 4, + .shift = nir_mem_access_shift_method_scalar, + }; + + default: + return nak_mem_access_size_align(intrin, bytes, bit_size, align_mul, + align_offset, offset_is_const, access, + cb_data); + } +} + static bool nir_shader_has_local_variables(const nir_shader *nir) { @@ -1284,14 +1347,108 @@ nak_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data) } } +static void +nak_mesh_skew_attr_mark_used(struct lower_mesh_intrinsics_ctx *ctx, + uint32_t base_addr, + uint32_t range, + bool per_primitive) +{ + if (base_addr == 0) + return; + + const uint32_t start_bit_idx = nak_mesh_skew_attr_used_index(base_addr); + const uint32_t end_bit_idx = nak_mesh_skew_attr_used_index(base_addr + range); + + if (per_primitive) + BITSET_SET_RANGE(ctx->skew_prim_attr_used, start_bit_idx, end_bit_idx - 1); + else + BITSET_SET_RANGE(ctx->skew_vert_attr_used, start_bit_idx, end_bit_idx - 1); +} + +static bool +nak_nir_gather_mesh_outputs(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx) +{ + bool progress = false; + + nir_foreach_function(func, nir) { + nir_foreach_block_safe(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output && + intrin->intrinsic != nir_intrinsic_store_per_vertex_output) + continue; + + nir_def *offset = intrin->src[2].ssa; + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + uint32_t component = nir_intrinsic_component(intrin); + uint32_t base_addr = nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location); + + /* Skip non SPH attributes */ + if (base_addr == 0) + continue; + + base_addr += 4 * component; + + uint32_t range; + if (nir_src_is_const(nir_src_for_ssa(offset))) { + uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset)); + + /* Tighten the range */ + base_addr += const_offset * 16; + range = 4 * intrin->num_components; + } else { + range = (sem.num_slots - 1) * 16 + intrin->num_components * 4; + } + + const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_store_per_primitive_output; + + nak_mesh_skew_attr_mark_used(ctx, base_addr, range, is_per_primitive); + } + } + } + + return progress; +} + void nak_postprocess_nir(nir_shader *nir, const struct nak_compiler *nak, nir_variable_mode robust2_modes, - const struct nak_fs_key *fs_key) + const struct nak_fs_key *fs_key, + bool has_task_shader) { UNUSED bool progress = false; + const bool is_mesh_stage = nir->info.stage == MESA_SHADER_TASK || + nir->info.stage == MESA_SHADER_MESH; + + if (is_mesh_stage) { + const uint32_t wg_size = nir->info.workgroup_size[0] * + nir->info.workgroup_size[1] * + nir->info.workgroup_size[2]; + + /* As the mesh stages run as vertex or tessellation stages, we only have + * 32 local invocations in hardware, so if the user requests more than 32 + * local invocations, we need to lower them. */ + if (wg_size > 32) { + /* Make sure that all system values are lowered and no halt/return/goto + * are present for nir_lower_workgroup_size. */ + OPT(nir, nir_lower_system_values); + OPT(nir, nir_lower_halt_to_return); + OPT(nir, nir_lower_returns); + OPT(nir, nir_lower_workgroup_size, 32); + + nak_optimize_nir(nir, nak); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + } + + OPT(nir, nak_nir_lower_mesh_stages_shared_atomics); + } + nak_optimize_nir(nir, nak); const nir_lower_subgroups_options subgroups_options = { @@ -1333,14 +1490,17 @@ nak_postprocess_nir(nir_shader *nir, vectorize_opts.modes = nir_var_mem_global | nir_var_mem_ssbo | nir_var_mem_shared | + nir_var_mem_task_payload | nir_var_shader_temp; vectorize_opts.callback = nak_mem_vectorize_cb; vectorize_opts.robust_modes = robust2_modes; OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts); nir_lower_mem_access_bit_sizes_options mem_bit_size_options = { - .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic, - .callback = nak_mem_access_size_align, + .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic | + nir_var_mem_task_payload, + .callback = is_mesh_stage ? nak_mesh_mem_access_size_align + : nak_mem_access_size_align, }; OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options); OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak); @@ -1417,6 +1577,30 @@ nak_postprocess_nir(nir_shader *nir, OPT(nir, nir_opt_constant_folding); break; + case MESA_SHADER_TASK: { + OPT(nir, nak_nir_lower_task_intrinsics); + OPT(nir, nir_opt_constant_folding); + break; + } + case MESA_SHADER_MESH: { + OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, nir_lower_io_lower_64bit_to_32); + OPT(nir, nir_opt_constant_folding); + + OPT(nir, nak_nir_lower_mesh_emulated_attributes); + + struct lower_mesh_intrinsics_ctx ctx = { + .nak = nak, + .max_vertices_out = nir->info.mesh.max_vertices_out, + .max_primitives_out = nir->info.mesh.max_primitives_out, + .has_task_shader = has_task_shader, + }; + OPT(nir, nak_nir_gather_mesh_outputs, &ctx); + OPT(nir, nak_nir_lower_mesh_intrinsics, &ctx); + OPT(nir, nir_opt_constant_folding); + break; + } + default: UNREACHABLE("Unsupported shader stage"); } diff --git a/src/nouveau/compiler/nak_nir_lower_fs_inputs.c b/src/nouveau/compiler/nak_nir_lower_fs_inputs.c index 7f0864efeaf..d4dc8c2b419 100644 --- a/src/nouveau/compiler/nak_nir_lower_fs_inputs.c +++ b/src/nouveau/compiler/nak_nir_lower_fs_inputs.c @@ -237,7 +237,8 @@ lower_fs_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *data) break; } - case nir_intrinsic_load_input: { + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_primitive_input: { const uint16_t addr = fs_input_intrin_addr(intrin, ctx->nak); res = load_fs_input(b, intrin->def.num_components, addr, ctx->nak); break; diff --git a/src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c b/src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c new file mode 100644 index 00000000000..c7cf35bc57d --- /dev/null +++ b/src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c @@ -0,0 +1,586 @@ +/* + * Copyright © 2026 Valve Corporation. + * Copyright © 2023 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "nak_private.h" +#include "nir_builder.h" + +static bool +lower_mesh_io_intrin(nir_builder *b, + nir_intrinsic_instr *intrin, + const struct lower_mesh_intrinsics_ctx *ctx) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *vtx = NULL, *offset = NULL, *data = NULL; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: + vtx = intrin->src[0].ssa; + offset = intrin->src[1].ssa; + break; + + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: + data = intrin->src[0].ssa; + vtx = intrin->src[1].ssa; + offset = intrin->src[2].ssa; + break; + + default: + UNREACHABLE("unknown intrinsic"); + } + + const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_load_per_primitive_output || + intrin->intrinsic == nir_intrinsic_store_per_primitive_output; + + const bool is_store = data != NULL; + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + + const bool is_primitive_indices = sem.location == VARYING_SLOT_PRIMITIVE_INDICES; + + const struct nak_nir_isbe_flags flags = { + .access = is_primitive_indices ? NAK_ISBE_ACCESS_MAP : NAK_ISBE_ACCESS_ATTR, + .output = true, + .skew = !is_primitive_indices, + .per_primitive = is_per_primitive, + }; + + uint32_t base_addr = + nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location) + + 4 * nir_intrinsic_component(intrin); + + uint32_t range; + if (nir_src_is_const(nir_src_for_ssa(offset))) { + uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset)); + /* Tighten the range */ + base_addr += const_offset * 16; + range = 4 * intrin->num_components; + + if (const_offset != 0) + offset = nir_imm_int(b, 0); + } else { + /* Offsets from NIR are in vec4's */ + offset = nir_imul_imm(b, offset, 16); + range = (sem.num_slots - 1) * 16 + intrin->num_components * 4; + } + + nir_def *isbe_offset; + uint32_t stride; + if (is_primitive_indices) { + const uint32_t vertices_per_prim = mesa_vertices_per_prim(b->shader->info.mesh.primitive_type); + + /* Indices are 8 bits on hardware */ + isbe_offset = nir_iadd(b, offset, nir_iadd_imm(b, nir_imul_imm(b, vtx, vertices_per_prim), 4)); + stride = 1; + } else { + uint16_t skew_attr_offset = nak_mesh_skew_offset(ctx, sem.location, base_addr, is_per_primitive); + nir_def *skew_start_offset; + uint16_t skew_group_size; + + if (is_per_primitive) { + skew_start_offset = nir_imm_int(b, nak_mesh_skew_vert_total_size(ctx)); + skew_group_size = nak_mesh_skew_prim_group_size(ctx); + } else { + skew_start_offset = nir_imm_int(b, 0); + skew_group_size = nak_mesh_skew_vert_group_size(ctx); + } + + /* Readjust offset to take into account SKEW groups */ + nir_def *offset_ajusted = nir_imul_imm(b, offset, NAK_MESH_SKEW_GROUP_COUNT); + skew_start_offset = nir_iadd(b, skew_start_offset, nir_imul_imm(b, nir_udiv_imm(b, vtx, 32), skew_group_size)); + + isbe_offset = nir_iadd(b, nir_iadd_imm(b, nir_iadd(b, nir_imul_imm(b, nir_imod_imm(b, vtx, 32), 4), + skew_start_offset), + skew_attr_offset), + offset_ajusted); + stride = 4 * NAK_MESH_SKEW_GROUP_COUNT; + } + + if (is_store) { + u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) { + nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride); + nir_def *c_data = nir_channel(b, data, c); + + /* Handle indices conversion */ + if (is_primitive_indices) + c_data = nir_u2u8(b, c_data); + + nir_isbewr_nv(b, c_data, c_offset, .range_base = base_addr, + .range = range, .flags = NAK_AS_U32(flags)); + } + } else { + const uint8_t bit_size = is_primitive_indices ? 8 : intrin->def.bit_size; + + nir_def *comps[NIR_MAX_VEC_COMPONENTS]; + for (uint32_t c = 0; c < intrin->num_components; c++) { + nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride); + nir_def *c_data = + nir_isberd_nv(b, bit_size, c_offset, .range_base = base_addr, + .range = range, .flags = NAK_AS_U32(flags)); + + /* Handle indices conversion */ + if (is_primitive_indices) + c_data = nir_u2u32(b, c_data); + + comps[c] = c_data; + } + + nir_def *dst = nir_vec(b, comps, intrin->num_components); + nir_def_rewrite_uses(&intrin->def, dst); + } + + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_set_vertex_and_primitive_count(nir_builder *b, + nir_intrinsic_instr *intrin) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *primitive_count = intrin->src[1].ssa; + nir_def *offset = nir_imm_int(b, 0x3); + + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_MAP, + .output = true, + .skew = false, + .per_primitive = false, + }; + + nir_isbewr_nv(b, primitive_count, offset, + .flags = NAK_AS_U32(flags)); + + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_workgroup_index(nir_builder *b, + nir_intrinsic_instr *intrin, + bool from_skew) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(b->shader); + + /* We need to make sure that this is read before any writes to allow ISBE + * space sharing optimisation to happen */ + b->cursor = nir_before_impl(impl); + + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = false, + .skew = from_skew, + .per_primitive = false, + }; + + nir_def *dst = nir_isberd_nv(b, 32, nir_imm_int(b, 0), + .range_base = NAK_ATTR_VERTEX_ID, + .range = 4, + .flags = NAK_AS_U32(flags)); + + nir_def_rewrite_uses(&intrin->def, dst); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_num_workgroups(nir_builder *b, nir_intrinsic_instr *intrin) +{ + /* If we are here, we have a task shader */ + b->cursor = nir_before_instr(&intrin->instr); + + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = false, + .skew = false, + .per_primitive = false, + }; + + nir_def *x = + nir_isberd_nv(b, 32, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags), + .access = ACCESS_CAN_REORDER); + nir_def *y = + nir_isberd_nv(b, 32, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags), + .access = ACCESS_CAN_REORDER); + nir_def *z = + nir_isberd_nv(b, 32, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags), + .access = ACCESS_CAN_REORDER); + nir_def *dst = nir_vec3(b, x, y, z); + nir_def_rewrite_uses(&intrin->def, dst); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_shared(nir_builder *b, nir_intrinsic_instr *intrin, + uint32_t base_offset) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *offset = intrin->src[0].ssa; + + const uint8_t bit_size = intrin->def.bit_size; + assert(bit_size == 32 && intrin->def.num_components == 1); + + const uint32_t base = nir_intrinsic_base(intrin); + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = true, + .skew = false, + .per_primitive = false, + }; + + offset = nir_iadd_imm(b, offset, base_offset + base); + nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags)); + nir_def_rewrite_uses(&intrin->def, dst); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_store_shared(nir_builder *b, nir_intrinsic_instr *intrin, + uint32_t base_offset) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *value = intrin->src[0].ssa; + nir_def *offset = intrin->src[1].ssa; + + const uint8_t bit_size = value->bit_size; + assert(bit_size == 32 && + nir_intrinsic_write_mask(intrin) == nir_component_mask(1)); + + const uint32_t base = nir_intrinsic_base(intrin); + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = true, + .skew = false, + .per_primitive = false, + }; + + offset = nir_iadd_imm(b, offset, base_offset + base); + nir_isbewr_nv(b, value, offset, .flags = NAK_AS_U32(flags)); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_task_payload(nir_builder *b, nir_intrinsic_instr *intrin) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *offset = intrin->src[0].ssa; + + const uint8_t bit_size = intrin->def.bit_size; + assert(bit_size == 32 && intrin->def.num_components == 1); + + const uint32_t base = nir_intrinsic_base(intrin); + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = false, + .skew = false, + .per_primitive = false, + }; + + offset = nir_iadd_imm(b, offset, base); + nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags), + .access = ACCESS_CAN_REORDER); + nir_def_rewrite_uses(&intrin->def, dst); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_mesh_intrin(nir_builder *b, + nir_intrinsic_instr *intrin, + void *cb_data) +{ + const struct lower_mesh_intrinsics_ctx *ctx = cb_data; + + /* Shared memory is after attributes on mesh shaders */ + const uint32_t shared_memory_base = nak_mesh_skew_total_size(ctx); + assert(shared_memory_base % 0x80 == 0); + + switch (intrin->intrinsic) { + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: + return lower_mesh_io_intrin(b, intrin, ctx); + case nir_intrinsic_set_vertex_and_primitive_count: + return lower_set_vertex_and_primitive_count(b, intrin); + case nir_intrinsic_load_workgroup_index: + return lower_load_workgroup_index(b, intrin, !ctx->has_task_shader); + case nir_intrinsic_load_num_workgroups: + return lower_load_num_workgroups(b, intrin); + case nir_intrinsic_load_shared: + return lower_load_shared(b, intrin, shared_memory_base); + case nir_intrinsic_store_shared: + return lower_store_shared(b, intrin, shared_memory_base); + case nir_intrinsic_load_task_payload: + return lower_load_task_payload(b, intrin); + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + UNREACHABLE( + "Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics"); + default: + return false; + } +} + +struct lower_emulated_attributes_state { + uint32_t viewport_shared_offset; + uint32_t cullprimitive_shared_offset; +}; + +static bool +lower_emulated_attributes_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + void *_data) +{ + const struct lower_emulated_attributes_state *state = _data; + nir_def *vtx = NULL, *offset = NULL; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_per_primitive_output: + vtx = intrin->src[0].ssa; + offset = intrin->src[1].ssa; + break; + + case nir_intrinsic_store_per_primitive_output: + vtx = intrin->src[1].ssa; + offset = intrin->src[2].ssa; + break; + + default: + return false; + } + + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + + if (sem.location != VARYING_SLOT_VIEWPORT && + sem.location != VARYING_SLOT_CULL_PRIMITIVE) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *shared_offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intrin)); + shared_offset = nir_iadd(b, shared_offset, nir_imul_imm(b, vtx, 4)); + + if (sem.location == VARYING_SLOT_CULL_PRIMITIVE) + shared_offset = + nir_iadd_imm(b, shared_offset, state->cullprimitive_shared_offset); + else + shared_offset = + nir_iadd_imm(b, shared_offset, state->viewport_shared_offset); + + if (intrin->intrinsic == nir_intrinsic_store_per_primitive_output) { + nir_def *data = intrin->src[0].ssa; + switch (sem.location) { + case VARYING_SLOT_VIEWPORT: + /* In case of Viewport, the data needs to be translated to a proper + * mask value to map to ViewportMask */ + data = nir_ishl(b, nir_imm_int(b, 1), data); + break; + case VARYING_SLOT_CULL_PRIMITIVE: + /* In case of CullPrimitive, the data is already a 32-bit value so no + * translation is needed */ + break; + default: + UNREACHABLE("Should never happen"); + } + + nir_store_shared(b, data, shared_offset); + nir_instr_remove(&intrin->instr); + } else { + /* Reading back isn't allowed by VK_EXT_mesh_shader but allowed by + * VK_NV_mesh_shader. We support readback for completeness and in case we + * add support for NV specific extension in the future */ + nir_def *data = nir_load_shared(b, 1, 32, shared_offset); + switch (sem.location) { + case VARYING_SLOT_VIEWPORT: + /* In case of Viewport, find the first index that is set. */ + data = nir_find_lsb(b, data); + break; + case VARYING_SLOT_CULL_PRIMITIVE: + /* In case of CullPrimitive, we check if no bits are set */ + data = nir_ine_imm(b, data, 0); + break; + default: + UNREACHABLE("Should never happen"); + } + nir_def_replace(&intrin->def, data); + } + + return true; +} + +bool +nak_nir_lower_mesh_emulated_attributes(nir_shader *nir) +{ + if (nir->info.stage != MESA_SHADER_MESH) + return false; + + /* Only apply this pass when really needed */ + if ((nir->info.per_primitive_outputs & + (VARYING_BIT_CULL_PRIMITIVE | VARYING_BIT_VIEWPORT)) == 0) + return false; + + /* If we are here, we need to emulate Viewport / CullPrimitive with the + * ViewportMask. This means if we need to always keep a shadow copy of the + * ViewportMask and CullPrimitive in shared memory and write the actual + * ViewportMask at the end of the shader. */ + bool progress = false; + + /* Reserve space for the Viewport and CullPrimitive shadow copies */ + uint32_t shared_memory_offset = nir->info.shared_size; + nir->info.shared_size += 8 * nir->info.mesh.max_primitives_out; + + struct lower_emulated_attributes_state state = { + .viewport_shared_offset = shared_memory_offset, + .cullprimitive_shared_offset = + shared_memory_offset + 4 * nir->info.mesh.max_primitives_out, + }; + + /* First we lower things to shared memory */ + progress |= nir_shader_intrinsics_pass(nir, lower_emulated_attributes_intrin, + nir_metadata_control_flow, &state); + + /* Finally, we ensure that the shared region is init at the start of the + * shader and we add primitive writes at the end of the shader to write the + * real value depending on the culling state.*/ + if (progress) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_builder b = nir_builder_at(nir_before_impl(impl)); + nir_def *zero = nir_imm_int(&b, 0); + nir_def *viewport_default = nir_imm_int(&b, 1 << 0); + + nir_def *lane_id = + nak_nir_load_sysval(&b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER); + nir_push_if(&b, nir_ieq(&b, lane_id, zero)); + { + for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) { + nir_store_shared( + &b, viewport_default, + nir_imm_int(&b, state.viewport_shared_offset + i * 4)); + nir_store_shared( + &b, zero, + nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4)); + } + } + nir_pop_if(&b, NULL); + nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL, + nir_var_mem_shared); + + b = nir_builder_at(nir_after_impl(impl)); + nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL, + nir_var_mem_shared); + nir_push_if(&b, nir_ieq(&b, lane_id, zero)); + { + for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) { + nir_def *viewport_mask = nir_load_shared( + &b, 1, 32, + nir_imm_int(&b, state.viewport_shared_offset + i * 4)); + nir_def *cull_primitive = nir_load_shared( + &b, 1, 32, + nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4)); + + viewport_mask = nir_bcsel(&b, nir_ine_imm(&b, cull_primitive, 0), + zero, viewport_mask); + nir_store_per_primitive_output( + &b, viewport_mask, nir_imm_int(&b, i), zero, .base = 0, + .src_type = nir_type_uint32, + .io_semantics = (nir_io_semantics){ + .location = VARYING_SLOT_VIEWPORT_MASK, + .num_slots = 1, + }); + } + } + nir_pop_if(&b, NULL); + } + + return progress; +} + +bool +nak_nir_lower_mesh_intrinsics(nir_shader *nir, + struct lower_mesh_intrinsics_ctx *ctx) +{ + return nir_shader_intrinsics_pass( + nir, lower_mesh_intrin, nir_metadata_block_index | nir_metadata_dominance, + ctx); +} + +static bool +lower_launch_mesh_workgroups(nir_builder *b, nir_intrinsic_instr *intrin) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *dim = intrin->src[0].ssa; + nir_def *x = nir_channel(b, dim, 0); + nir_def *y = nir_channel(b, dim, 1); + nir_def *z = nir_channel(b, dim, 2); + nir_def *task_count = nir_imul(b, nir_imul(b, x, y), z); + + const struct nak_nir_isbe_flags flags = { + .access = NAK_ISBE_ACCESS_ATTR, + .output = true, + .skew = false, + .per_primitive = false, + }; + + nir_isbewr_nv(b, task_count, nir_imm_int(b, 0x4), + .flags = NAK_AS_U32(flags)); + nir_isbewr_nv(b, x, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags)); + nir_isbewr_nv(b, y, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags)); + nir_isbewr_nv(b, z, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags)); + nir_instr_remove(&intrin->instr); + return true; +} + +static bool +lower_task_intrin(nir_builder *b, + nir_intrinsic_instr *intrin, + void *cb_data) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_load_shared: + return lower_load_shared(b, intrin, 0); + case nir_intrinsic_store_shared: + return lower_store_shared(b, intrin, 0); + case nir_intrinsic_load_workgroup_index: + return lower_load_workgroup_index(b, intrin, true); + case nir_intrinsic_launch_mesh_workgroups: + return lower_launch_mesh_workgroups(b, intrin); + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + UNREACHABLE( + "Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics"); + case nir_intrinsic_load_task_payload: + case nir_intrinsic_store_task_payload: + case nir_intrinsic_task_payload_atomic: + case nir_intrinsic_task_payload_atomic_swap: + UNREACHABLE("Should have been lowered by nvk_nir_lower_task_shader"); + default: + return false; + } +} + +bool +nak_nir_lower_task_intrinsics(nir_shader *nir) +{ + return nir_shader_intrinsics_pass(nir, lower_task_intrin, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h index 520c7fdb896..c035a82cfa4 100644 --- a/src/nouveau/compiler/nak_private.h +++ b/src/nouveau/compiler/nak_private.h @@ -76,6 +76,8 @@ enum ENUM_PACKED nak_attr { NAK_ATTR_INSTANCE_ID = 0x2f8, NAK_ATTR_VERTEX_ID = 0x2fc, + /* System values D */ + NAK_ATTR_VIEWPORT_MASK = 0x3a0, NAK_ATTR_BARY_COORD_NO_PERSP_X = 0x3a8, NAK_ATTR_BARY_COORD_NO_PERSP_Y = 0x3ac, NAK_ATTR_BARY_COORD_NO_PERSP_Z = 0x3b0, @@ -84,7 +86,8 @@ enum ENUM_PACKED nak_attr { NAK_ATTR_BARY_COORD_X = 0x3b4, NAK_ATTR_BARY_COORD_Y = 0x3b8, NAK_ATTR_BARY_COORD_Z = 0x3bc, - NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X, + NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X, + NAK_ATTR_SPH_END = NAK_ATTR_BARY_COORD_Z + 4, /* Not in SPH */ NAK_ATTR_FRONT_FACE = 0x3fc, @@ -100,6 +103,8 @@ nak_attribute_attr_addr(UNUSED const struct nak_compiler *nak, uint16_t nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot); +uint16_t nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak, + gl_varying_slot slot); uint16_t nak_sysval_attr_addr(const struct nak_compiler *nak, gl_system_value sysval); @@ -274,6 +279,81 @@ struct nak_nir_isbe_flags { uint32_t pad : 27; }; +struct lower_mesh_intrinsics_ctx { + const struct nak_compiler *nak; + + uint32_t max_vertices_out; + uint32_t max_primitives_out; + bool has_task_shader; + + BITSET_DECLARE(skew_vert_attr_used, NAK_ATTR_SPH_END); + BITSET_DECLARE(skew_prim_attr_used, NAK_ATTR_SPH_END); +}; + +#define NAK_MESH_SKEW_GROUP_COUNT 32 + +static uint32_t +nak_mesh_skew_attr_used_index(uint32_t base_addr) +{ + assert(base_addr < NAK_ATTR_SPH_END); + + return base_addr / 4; +} + +static uint32_t +nak_mesh_skew_vert_group_size(const struct lower_mesh_intrinsics_ctx *ctx) +{ + return BITSET_COUNT(ctx->skew_vert_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT; +} + +static uint32_t +nak_mesh_skew_vert_total_size(const struct lower_mesh_intrinsics_ctx *ctx) +{ + return nak_mesh_skew_vert_group_size(ctx) * DIV_ROUND_UP(ctx->max_vertices_out, NAK_MESH_SKEW_GROUP_COUNT); +} + +static uint32_t +nak_mesh_skew_prim_group_size(const struct lower_mesh_intrinsics_ctx *ctx) +{ + return BITSET_COUNT(ctx->skew_prim_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT; +} + +static uint32_t +nak_mesh_skew_prim_total_size(const struct lower_mesh_intrinsics_ctx *ctx) +{ + return nak_mesh_skew_prim_group_size(ctx) * DIV_ROUND_UP(ctx->max_primitives_out, NAK_MESH_SKEW_GROUP_COUNT); +} + +static uint32_t +nak_mesh_skew_total_size(const struct lower_mesh_intrinsics_ctx *ctx) +{ + return nak_mesh_skew_vert_total_size(ctx) + nak_mesh_skew_prim_total_size(ctx); +} + +static uint32_t +nak_mesh_skew_offset(const struct lower_mesh_intrinsics_ctx *ctx, + gl_varying_slot slot, + uint32_t base_addr, + bool per_primitive) +{ + const uint32_t bit_idx = nak_mesh_skew_attr_used_index(base_addr); + + uint32_t bit_count; + + if (per_primitive) + bit_count = BITSET_PREFIX_SUM(ctx->skew_prim_attr_used, bit_idx); + else + bit_count = BITSET_PREFIX_SUM(ctx->skew_vert_attr_used, bit_idx); + + uint32_t size = bit_count * 4 * NAK_MESH_SKEW_GROUP_COUNT; + + return size; +} + +bool nak_nir_lower_mesh_emulated_attributes(nir_shader *nir); +bool nak_nir_lower_mesh_intrinsics(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx); +bool nak_nir_lower_task_intrinsics(nir_shader *nir); + enum nak_interp_mode { NAK_INTERP_MODE_PERSPECTIVE, NAK_INTERP_MODE_SCREEN_LINEAR, diff --git a/src/nouveau/vulkan/meson.build b/src/nouveau/vulkan/meson.build index 21b66e8a3b2..89e6d39cfcc 100644 --- a/src/nouveau/vulkan/meson.build +++ b/src/nouveau/vulkan/meson.build @@ -52,6 +52,7 @@ nvk_files = files( 'nvk_mme.c', 'nvk_mme.h', 'nvk_nir_lower_descriptors.c', + 'nvk_nir_lower_mesh_shader.c', 'nvk_physical_device.c', 'nvk_physical_device.h', 'nvk_private.h', diff --git a/src/nouveau/vulkan/nvk_nir_lower_descriptors.c b/src/nouveau/vulkan/nvk_nir_lower_descriptors.c index 286293391f0..936140270ca 100644 --- a/src/nouveau/vulkan/nvk_nir_lower_descriptors.c +++ b/src/nouveau/vulkan/nvk_nir_lower_descriptors.c @@ -60,6 +60,7 @@ struct lower_descriptors_ctx { bool use_edb_buffer_views; bool clamp_desc_array_bounds; bool indirect_bind; + bool has_task_shader; nir_address_format ubo_addr_format; nir_address_format ssbo_addr_format; @@ -1122,6 +1123,8 @@ static bool try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin, const struct lower_descriptors_ctx *ctx) { + const mesa_shader_stage stage = b->shader->info.stage; + switch (intrin->intrinsic) { case nir_intrinsic_load_constant: return lower_load_constant(b, intrin, ctx); @@ -1133,10 +1136,20 @@ try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin, UNREACHABLE("Should have been lowered by nir_lower_cs_intrinsics()"); case nir_intrinsic_load_num_workgroups: + /* We use ISBE.ATTR to pass this from task. */ + if (stage == MESA_SHADER_MESH && ctx->has_task_shader) + return false; + + if (stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK) + return lower_sysval_to_root_table(b, intrin, draw.mesh.group_count, ctx); + return lower_sysval_to_root_table(b, intrin, cs.group_count, ctx); case nir_intrinsic_load_base_workgroup_id: - return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx); + if (stage == MESA_SHADER_COMPUTE) + return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx); + + return false; case nir_intrinsic_load_push_constant: return lower_load_push_constant(b, intrin, ctx); @@ -1548,6 +1561,7 @@ nvk_nir_lower_descriptors(nir_shader *nir, rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, .indirect_bind = shader_flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT, + .has_task_shader = (shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0, .ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs), .ubo_addr_format = nvk_ubo_addr_format(pdev, rs), }; diff --git a/src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c b/src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c new file mode 100644 index 00000000000..c9755cd4d7b --- /dev/null +++ b/src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c @@ -0,0 +1,190 @@ +/* + * Copyright © 2026 Valve Corporation. + * SPDX-License-Identifier: MIT + */ +#include "util/macros.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_defines.h" +#include "nvk_shader.h" +#include "shader_enums.h" + +static bool +add_task_payload_base_offset(nir_builder *b, nir_intrinsic_instr *intrin, + void *data) +{ + const uint32_t *offset = data; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_task_payload: + case nir_intrinsic_store_task_payload: + case nir_intrinsic_task_payload_atomic: + case nir_intrinsic_task_payload_atomic_swap: + break; + default: + return false; + } + + unsigned base = nir_intrinsic_base(intrin); + nir_intrinsic_set_base(intrin, base + *offset); + return true; +} + +static bool +nvk_nir_lower_common_task_payload(nir_shader *nir) +{ + /* The first 0x20 bytes are used by launch_mesh_workgroups */ + uint32_t task_payload_reserved_size = 0x20; + + /* Take into account the reserved chunk in task memory */ + nir->info.task_payload_size += task_payload_reserved_size; + + /* Add the reserved chunk to every task payload accesses */ + return nir_shader_intrinsics_pass(nir, add_task_payload_base_offset, + nir_metadata_all, + &task_payload_reserved_size); +} + +static bool +lower_set_vertex_and_primitive_count_intrin(nir_builder *b, + nir_intrinsic_instr *intrin, + UNUSED void *data) +{ + if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *local_invocation_index = nir_load_local_invocation_index(b); + nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0))); + { + nir_set_vertex_and_primitive_count( + b, intrin->src[0].ssa, intrin->src[1].ssa, intrin->src[2].ssa); + } + nir_pop_if(b, NULL); + + nir_instr_remove(&intrin->instr); + + return true; +} + +bool +nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags) +{ + if (nir->info.stage != MESA_SHADER_MESH) + return false; + + bool progress = false; + + if ((shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0) + progress |= nvk_nir_lower_common_task_payload(nir); + + progress |= nir_shader_intrinsics_pass( + nir, lower_set_vertex_and_primitive_count_intrin, nir_metadata_none, + NULL); + + return progress; +} + +static bool +launch_mesh_workgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + UNUSED void *data) +{ + if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *local_invocation_index = nir_load_local_invocation_index(b); + nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0))); + { + nir_launch_mesh_workgroups(b, intrin->src[0].ssa); + } + nir_pop_if(b, NULL); + nir_instr_remove(&intrin->instr); + + return true; +} + +static nir_intrinsic_op +task_payload_intrinsic_to_shared(nir_intrinsic_op op) +{ + switch (op) { + case nir_intrinsic_load_task_payload: + return nir_intrinsic_load_shared; + case nir_intrinsic_store_task_payload: + return nir_intrinsic_store_shared; + case nir_intrinsic_task_payload_atomic: + return nir_intrinsic_shared_atomic; + case nir_intrinsic_task_payload_atomic_swap: + return nir_intrinsic_shared_atomic_swap; + default: + return nir_num_intrinsics; + } +} + +static bool +lower_task_payload_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + UNUSED void *data) +{ + nir_intrinsic_op new_op = + task_payload_intrinsic_to_shared(intrin->intrinsic); + if (new_op == nir_num_intrinsics) + return false; + + intrin->intrinsic = new_op; + return true; +} + +static bool +add_shared_base_offset(nir_builder *b, nir_intrinsic_instr *intrin, + UNUSED void *data) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + break; + default: + return false; + } + + const uint32_t shared_memory_base = b->shader->info.task_payload_size; + assert(shared_memory_base % 0x80 == 0); + + unsigned base = nir_intrinsic_base(intrin); + nir_intrinsic_set_base(intrin, base + shared_memory_base); + return true; +} + +bool +nvk_nir_lower_task_shader(nir_shader *nir) +{ + if (nir->info.stage != MESA_SHADER_TASK) + return false; + + bool progress = false; + + /* Apply common lowering for task payload */ + progress |= nvk_nir_lower_common_task_payload(nir); + + /* Ensure alignment based on ISBE mem lines size (128 bytes) */ + nir->info.task_payload_size = align(nir->info.task_payload_size, 128); + + /* Readjust shared memory size to include the task payload */ + nir->info.shared_size += nir->info.task_payload_size; + + /* Now move all shared memory after task payload range and lower task payload + * to shared memory */ + progress |= nir_shader_intrinsics_pass(nir, add_shared_base_offset, + nir_metadata_all, NULL); + progress |= nir_shader_intrinsics_pass(nir, lower_task_payload_intrin, + nir_metadata_all, NULL); + + /* Finally we ensure that launch_mesh_workgroups is only running on lane 0 */ + progress |= nir_shader_intrinsics_pass(nir, launch_mesh_workgroups_intrin, + nir_metadata_none, NULL); + + return progress; +} diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c index 4fd898c9166..679ca420815 100644 --- a/src/nouveau/vulkan/nvk_shader.c +++ b/src/nouveau/vulkan/nvk_shader.c @@ -382,8 +382,10 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir, lookup_ycbcr_conversion, &ycbcr_state); nir_lower_compute_system_values_options csv_options = { - .has_base_workgroup_id = true, + .has_base_workgroup_id = mesa_shader_stage_is_compute(nir->info.stage), .lower_local_invocation_index = mesa_shader_stage_is_compute(nir->info.stage), + .lower_workgroup_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage), + .lower_cs_local_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage), }; NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); @@ -461,19 +463,43 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir, NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_intrinsic, nir_metadata_none, pdev); - if (mesa_shader_stage_is_compute(nir->info.stage)) { - NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared, shared_var_info); - NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, + if (mesa_shader_stage_uses_workgroup(nir->info.stage)) { + nir_variable_mode var_modes = nir_var_mem_shared; + + if (mesa_shader_stage_is_mesh(nir->info.stage)) + var_modes |= nir_var_mem_task_payload; + + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, var_modes, + shared_var_info); + NIR_PASS(_, nir, nir_lower_explicit_io, var_modes, nir_address_format_32bit_offset); + if (nir->info.stage == MESA_SHADER_TASK) + NIR_PASS(_, nir, nvk_nir_lower_task_shader); + else if (nir->info.stage == MESA_SHADER_MESH) + NIR_PASS(_, nir, nvk_nir_lower_mesh_shader, shader_flags); + if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { - /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's safe to - * align everything up to 16B so we can write whole vec4s. - */ - nir->info.shared_size = align(nir->info.shared_size, 16); + uint32_t alignment; + uint32_t chunk_size; + + if (mesa_shader_stage_is_mesh(nir->info.stage)) { + /* With task/mesh shaders, shared is in ISBE attribute space and is + * allocated in "lines" of 128 bytes. Additionally, we ISBE I/O + * instructions only support 1B and 4B granualities.*/ + alignment = 128; + chunk_size = 4; + } else { + /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's + * safe to align everything up to 16B so we can write whole vec4s. + */ + alignment = 16; + chunk_size = 16; + } + + nir->info.shared_size = align(nir->info.shared_size, alignment); NIR_PASS(_, nir, nir_zero_initialize_shared_memory, - nir->info.shared_size, 16); + nir->info.shared_size, chunk_size); /* We need to call lower_compute_system_values again because * nir_zero_initialize_shared_memory generates load_invocation_id which diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h index be9aebec05c..8efcff22812 100644 --- a/src/nouveau/vulkan/nvk_shader.h +++ b/src/nouveau/vulkan/nvk_shader.h @@ -158,6 +158,9 @@ nvk_nir_lower_descriptors(nir_shader *nir, struct vk_descriptor_set_layout * const *set_layouts, struct nvk_cbuf_map *cbuf_map_out); +bool nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags); +bool nvk_nir_lower_task_shader(nir_shader *nir); + VkResult nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir, const VkAllocationCallbacks *alloc,