mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-09 23:08:18 +02:00
nvk: Lower mesh and task shaders
Signed-off-by: Mary Guillemard <mary@mary.zone> Reviewed-by: Mel Henning <mhenning@darkrefraction.com> Tested-by: Thomas H.P. Andersen <phomes@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27196>
This commit is contained in:
parent
cf29933de1
commit
cdb0dea462
13 changed files with 1117 additions and 18 deletions
|
|
@ -23,6 +23,7 @@ libnak_c_files = files(
|
|||
'nak_nir_lower_fs_inputs.c',
|
||||
'nak_nir_lower_gs_intrinsics.c',
|
||||
'nak_nir_lower_image_addrs.c',
|
||||
'nak_nir_lower_mesh_intrinsics.c',
|
||||
'nak_nir_lower_non_uniform_ldcx.c',
|
||||
'nak_nir_lower_scan_reduce.c',
|
||||
'nak_nir_lower_shared_atomics.c',
|
||||
|
|
|
|||
|
|
@ -109,7 +109,8 @@ const extern struct nak_constant_offset_info nak_const_offsets_turing_graphics;
|
|||
|
||||
void nak_postprocess_nir(nir_shader *nir, const struct nak_compiler *nak,
|
||||
nir_variable_mode robust2_modes,
|
||||
const struct nak_fs_key *fs_key);
|
||||
const struct nak_fs_key *fs_key,
|
||||
bool has_task_shader);
|
||||
|
||||
enum ENUM_PACKED nak_ts_domain {
|
||||
NAK_TS_DOMAIN_ISOLINE = 0,
|
||||
|
|
|
|||
|
|
@ -485,7 +485,9 @@ fn nak_compile_shader_internal(
|
|||
fs_key: *const nak_fs_key,
|
||||
has_task_shader: bool,
|
||||
) -> *mut nak_shader_bin {
|
||||
unsafe { nak_postprocess_nir(nir, nak, robust2_modes, fs_key) };
|
||||
unsafe {
|
||||
nak_postprocess_nir(nir, nak, robust2_modes, fs_key, has_task_shader)
|
||||
};
|
||||
let nak = unsafe { &*nak };
|
||||
let nir = unsafe { &*nir };
|
||||
let fs_key = if fs_key.is_null() {
|
||||
|
|
|
|||
|
|
@ -3748,6 +3748,11 @@ impl<'a> ShaderFromNir<'a> {
|
|||
self.set_dst(&intrin.def, dst.into());
|
||||
}
|
||||
nir_intrinsic_shared_atomic_nv => {
|
||||
assert!(
|
||||
self.nir.info.stage() == MESA_SHADER_COMPUTE
|
||||
|| self.nir.info.stage() == MESA_SHADER_KERNEL
|
||||
);
|
||||
|
||||
let bit_size = intrin.def.bit_size();
|
||||
let addr = self.get_src(&srcs[0]);
|
||||
let uaddr = self.get_src(&srcs[1]);
|
||||
|
|
@ -3775,6 +3780,11 @@ impl<'a> ShaderFromNir<'a> {
|
|||
self.set_dst(&intrin.def, dst);
|
||||
}
|
||||
nir_intrinsic_shared_atomic_swap_nv => {
|
||||
assert!(
|
||||
self.nir.info.stage() == MESA_SHADER_COMPUTE
|
||||
|| self.nir.info.stage() == MESA_SHADER_KERNEL
|
||||
);
|
||||
|
||||
assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
|
||||
let bit_size = intrin.def.bit_size();
|
||||
let addr = self.get_src(&srcs[0]);
|
||||
|
|
|
|||
|
|
@ -69,6 +69,14 @@ nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
|
|||
*/
|
||||
return true;
|
||||
|
||||
case MESA_SHADER_TASK:
|
||||
case MESA_SHADER_MESH:
|
||||
/*
|
||||
* Task and Mesh runs on the Vertex and Tesselation stage and follows the
|
||||
* same rules.
|
||||
*/
|
||||
return true;
|
||||
|
||||
case MESA_SHADER_COMPUTE:
|
||||
case MESA_SHADER_KERNEL: {
|
||||
if (nir->info.workgroup_size_variable)
|
||||
|
|
@ -414,6 +422,7 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
|
|||
case VARYING_SLOT_POS: return NAK_ATTR_POSITION;
|
||||
case VARYING_SLOT_CLIP_DIST0: return NAK_ATTR_CLIP_CULL_DIST_0;
|
||||
case VARYING_SLOT_CLIP_DIST1: return NAK_ATTR_CLIP_CULL_DIST_4;
|
||||
case VARYING_SLOT_VIEWPORT_MASK: return NAK_ATTR_VIEWPORT_MASK;
|
||||
case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
|
||||
return nak->sm >= 86 ? NAK_ATTR_VPRS_TABLE_INDEX
|
||||
: NAK_ATTR_VIEWPORT_INDEX;
|
||||
|
|
@ -422,6 +431,23 @@ nak_varying_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
|
|||
}
|
||||
}
|
||||
|
||||
uint16_t
|
||||
nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak, gl_varying_slot slot)
|
||||
{
|
||||
switch (slot) {
|
||||
/* Don't map to anything in SPH */
|
||||
case VARYING_SLOT_PRIMITIVE_COUNT:
|
||||
case VARYING_SLOT_PRIMITIVE_INDICES:
|
||||
return 0;
|
||||
case VARYING_SLOT_VIEWPORT:
|
||||
case VARYING_SLOT_CULL_PRIMITIVE:
|
||||
UNREACHABLE("Should have been lowered by nak_nir_lower_mesh_emulated_attributes");
|
||||
|
||||
default: return nak_varying_attr_addr(nak, slot);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static uint16_t
|
||||
nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
|
||||
{
|
||||
|
|
@ -552,6 +578,9 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
|||
}
|
||||
|
||||
case nir_intrinsic_load_local_invocation_id: {
|
||||
/* Should have been lowered earlier */
|
||||
assert(!mesa_shader_stage_is_mesh(b->shader->info.stage));
|
||||
|
||||
nir_def *x = nak_nir_load_sysval(b, NAK_SV_TID_X,
|
||||
ACCESS_CAN_REORDER);
|
||||
nir_def *y = nak_nir_load_sysval(b, NAK_SV_TID_Y,
|
||||
|
|
@ -607,6 +636,15 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
|||
}
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_local_invocation_index: {
|
||||
if (b->shader->info.stage != MESA_SHADER_TASK &&
|
||||
b->shader->info.stage != MESA_SHADER_MESH)
|
||||
return false;
|
||||
|
||||
val = nak_nir_load_sysval(b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_is_helper_invocation:
|
||||
case nir_intrinsic_load_helper_invocation: {
|
||||
val = nak_nir_load_sysval(b, NAK_SV_THREAD_KILL, 0);
|
||||
|
|
@ -964,6 +1002,31 @@ nak_mem_access_size_align(nir_intrinsic_op intrin,
|
|||
}
|
||||
}
|
||||
|
||||
static nir_mem_access_size_align
|
||||
nak_mesh_mem_access_size_align(nir_intrinsic_op intrin,
|
||||
uint8_t bytes, uint8_t bit_size,
|
||||
uint32_t align_mul, uint32_t align_offset,
|
||||
bool offset_is_const, enum gl_access_qualifier access,
|
||||
const void *cb_data)
|
||||
{
|
||||
switch (intrin) {
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_task_payload:
|
||||
case nir_intrinsic_store_shared:
|
||||
return (nir_mem_access_size_align) {
|
||||
.bit_size = 32,
|
||||
.num_components = 1,
|
||||
.align = 4,
|
||||
.shift = nir_mem_access_shift_method_scalar,
|
||||
};
|
||||
|
||||
default:
|
||||
return nak_mem_access_size_align(intrin, bytes, bit_size, align_mul,
|
||||
align_offset, offset_is_const, access,
|
||||
cb_data);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
nir_shader_has_local_variables(const nir_shader *nir)
|
||||
{
|
||||
|
|
@ -1284,14 +1347,108 @@ nak_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data)
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
nak_mesh_skew_attr_mark_used(struct lower_mesh_intrinsics_ctx *ctx,
|
||||
uint32_t base_addr,
|
||||
uint32_t range,
|
||||
bool per_primitive)
|
||||
{
|
||||
if (base_addr == 0)
|
||||
return;
|
||||
|
||||
const uint32_t start_bit_idx = nak_mesh_skew_attr_used_index(base_addr);
|
||||
const uint32_t end_bit_idx = nak_mesh_skew_attr_used_index(base_addr + range);
|
||||
|
||||
if (per_primitive)
|
||||
BITSET_SET_RANGE(ctx->skew_prim_attr_used, start_bit_idx, end_bit_idx - 1);
|
||||
else
|
||||
BITSET_SET_RANGE(ctx->skew_vert_attr_used, start_bit_idx, end_bit_idx - 1);
|
||||
}
|
||||
|
||||
static bool
|
||||
nak_nir_gather_mesh_outputs(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_function(func, nir) {
|
||||
nir_foreach_block_safe(block, func->impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output &&
|
||||
intrin->intrinsic != nir_intrinsic_store_per_vertex_output)
|
||||
continue;
|
||||
|
||||
nir_def *offset = intrin->src[2].ssa;
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
|
||||
uint32_t component = nir_intrinsic_component(intrin);
|
||||
uint32_t base_addr = nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location);
|
||||
|
||||
/* Skip non SPH attributes */
|
||||
if (base_addr == 0)
|
||||
continue;
|
||||
|
||||
base_addr += 4 * component;
|
||||
|
||||
uint32_t range;
|
||||
if (nir_src_is_const(nir_src_for_ssa(offset))) {
|
||||
uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
|
||||
|
||||
/* Tighten the range */
|
||||
base_addr += const_offset * 16;
|
||||
range = 4 * intrin->num_components;
|
||||
} else {
|
||||
range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
|
||||
}
|
||||
|
||||
const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_store_per_primitive_output;
|
||||
|
||||
nak_mesh_skew_attr_mark_used(ctx, base_addr, range, is_per_primitive);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
void
|
||||
nak_postprocess_nir(nir_shader *nir,
|
||||
const struct nak_compiler *nak,
|
||||
nir_variable_mode robust2_modes,
|
||||
const struct nak_fs_key *fs_key)
|
||||
const struct nak_fs_key *fs_key,
|
||||
bool has_task_shader)
|
||||
{
|
||||
UNUSED bool progress = false;
|
||||
|
||||
const bool is_mesh_stage = nir->info.stage == MESA_SHADER_TASK ||
|
||||
nir->info.stage == MESA_SHADER_MESH;
|
||||
|
||||
if (is_mesh_stage) {
|
||||
const uint32_t wg_size = nir->info.workgroup_size[0] *
|
||||
nir->info.workgroup_size[1] *
|
||||
nir->info.workgroup_size[2];
|
||||
|
||||
/* As the mesh stages run as vertex or tessellation stages, we only have
|
||||
* 32 local invocations in hardware, so if the user requests more than 32
|
||||
* local invocations, we need to lower them. */
|
||||
if (wg_size > 32) {
|
||||
/* Make sure that all system values are lowered and no halt/return/goto
|
||||
* are present for nir_lower_workgroup_size. */
|
||||
OPT(nir, nir_lower_system_values);
|
||||
OPT(nir, nir_lower_halt_to_return);
|
||||
OPT(nir, nir_lower_returns);
|
||||
OPT(nir, nir_lower_workgroup_size, 32);
|
||||
|
||||
nak_optimize_nir(nir, nak);
|
||||
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
|
||||
}
|
||||
|
||||
OPT(nir, nak_nir_lower_mesh_stages_shared_atomics);
|
||||
}
|
||||
|
||||
nak_optimize_nir(nir, nak);
|
||||
|
||||
const nir_lower_subgroups_options subgroups_options = {
|
||||
|
|
@ -1333,14 +1490,17 @@ nak_postprocess_nir(nir_shader *nir,
|
|||
vectorize_opts.modes = nir_var_mem_global |
|
||||
nir_var_mem_ssbo |
|
||||
nir_var_mem_shared |
|
||||
nir_var_mem_task_payload |
|
||||
nir_var_shader_temp;
|
||||
vectorize_opts.callback = nak_mem_vectorize_cb;
|
||||
vectorize_opts.robust_modes = robust2_modes;
|
||||
OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
|
||||
|
||||
nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
|
||||
.modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
|
||||
.callback = nak_mem_access_size_align,
|
||||
.modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic |
|
||||
nir_var_mem_task_payload,
|
||||
.callback = is_mesh_stage ? nak_mesh_mem_access_size_align
|
||||
: nak_mem_access_size_align,
|
||||
};
|
||||
OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
|
||||
OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
|
||||
|
|
@ -1417,6 +1577,30 @@ nak_postprocess_nir(nir_shader *nir,
|
|||
OPT(nir, nir_opt_constant_folding);
|
||||
break;
|
||||
|
||||
case MESA_SHADER_TASK: {
|
||||
OPT(nir, nak_nir_lower_task_intrinsics);
|
||||
OPT(nir, nir_opt_constant_folding);
|
||||
break;
|
||||
}
|
||||
case MESA_SHADER_MESH: {
|
||||
OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
|
||||
type_size_vec4, nir_lower_io_lower_64bit_to_32);
|
||||
OPT(nir, nir_opt_constant_folding);
|
||||
|
||||
OPT(nir, nak_nir_lower_mesh_emulated_attributes);
|
||||
|
||||
struct lower_mesh_intrinsics_ctx ctx = {
|
||||
.nak = nak,
|
||||
.max_vertices_out = nir->info.mesh.max_vertices_out,
|
||||
.max_primitives_out = nir->info.mesh.max_primitives_out,
|
||||
.has_task_shader = has_task_shader,
|
||||
};
|
||||
OPT(nir, nak_nir_gather_mesh_outputs, &ctx);
|
||||
OPT(nir, nak_nir_lower_mesh_intrinsics, &ctx);
|
||||
OPT(nir, nir_opt_constant_folding);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
UNREACHABLE("Unsupported shader stage");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -237,7 +237,8 @@ lower_fs_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_input: {
|
||||
case nir_intrinsic_load_input:
|
||||
case nir_intrinsic_load_per_primitive_input: {
|
||||
const uint16_t addr = fs_input_intrin_addr(intrin, ctx->nak);
|
||||
res = load_fs_input(b, intrin->def.num_components, addr, ctx->nak);
|
||||
break;
|
||||
|
|
|
|||
586
src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c
Normal file
586
src/nouveau/compiler/nak_nir_lower_mesh_intrinsics.c
Normal file
|
|
@ -0,0 +1,586 @@
|
|||
/*
|
||||
* Copyright © 2026 Valve Corporation.
|
||||
* Copyright © 2023 Collabora, Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "nak_private.h"
|
||||
#include "nir_builder.h"
|
||||
|
||||
static bool
|
||||
lower_mesh_io_intrin(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *vtx = NULL, *offset = NULL, *data = NULL;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_per_vertex_output:
|
||||
case nir_intrinsic_load_per_primitive_output:
|
||||
vtx = intrin->src[0].ssa;
|
||||
offset = intrin->src[1].ssa;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_store_per_vertex_output:
|
||||
case nir_intrinsic_store_per_primitive_output:
|
||||
data = intrin->src[0].ssa;
|
||||
vtx = intrin->src[1].ssa;
|
||||
offset = intrin->src[2].ssa;
|
||||
break;
|
||||
|
||||
default:
|
||||
UNREACHABLE("unknown intrinsic");
|
||||
}
|
||||
|
||||
const bool is_per_primitive = intrin->intrinsic == nir_intrinsic_load_per_primitive_output ||
|
||||
intrin->intrinsic == nir_intrinsic_store_per_primitive_output;
|
||||
|
||||
const bool is_store = data != NULL;
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
|
||||
|
||||
const bool is_primitive_indices = sem.location == VARYING_SLOT_PRIMITIVE_INDICES;
|
||||
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = is_primitive_indices ? NAK_ISBE_ACCESS_MAP : NAK_ISBE_ACCESS_ATTR,
|
||||
.output = true,
|
||||
.skew = !is_primitive_indices,
|
||||
.per_primitive = is_per_primitive,
|
||||
};
|
||||
|
||||
uint32_t base_addr =
|
||||
nak_varying_mesh_skew_attr_addr(ctx->nak, sem.location) +
|
||||
4 * nir_intrinsic_component(intrin);
|
||||
|
||||
uint32_t range;
|
||||
if (nir_src_is_const(nir_src_for_ssa(offset))) {
|
||||
uint32_t const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
|
||||
/* Tighten the range */
|
||||
base_addr += const_offset * 16;
|
||||
range = 4 * intrin->num_components;
|
||||
|
||||
if (const_offset != 0)
|
||||
offset = nir_imm_int(b, 0);
|
||||
} else {
|
||||
/* Offsets from NIR are in vec4's */
|
||||
offset = nir_imul_imm(b, offset, 16);
|
||||
range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
|
||||
}
|
||||
|
||||
nir_def *isbe_offset;
|
||||
uint32_t stride;
|
||||
if (is_primitive_indices) {
|
||||
const uint32_t vertices_per_prim = mesa_vertices_per_prim(b->shader->info.mesh.primitive_type);
|
||||
|
||||
/* Indices are 8 bits on hardware */
|
||||
isbe_offset = nir_iadd(b, offset, nir_iadd_imm(b, nir_imul_imm(b, vtx, vertices_per_prim), 4));
|
||||
stride = 1;
|
||||
} else {
|
||||
uint16_t skew_attr_offset = nak_mesh_skew_offset(ctx, sem.location, base_addr, is_per_primitive);
|
||||
nir_def *skew_start_offset;
|
||||
uint16_t skew_group_size;
|
||||
|
||||
if (is_per_primitive) {
|
||||
skew_start_offset = nir_imm_int(b, nak_mesh_skew_vert_total_size(ctx));
|
||||
skew_group_size = nak_mesh_skew_prim_group_size(ctx);
|
||||
} else {
|
||||
skew_start_offset = nir_imm_int(b, 0);
|
||||
skew_group_size = nak_mesh_skew_vert_group_size(ctx);
|
||||
}
|
||||
|
||||
/* Readjust offset to take into account SKEW groups */
|
||||
nir_def *offset_ajusted = nir_imul_imm(b, offset, NAK_MESH_SKEW_GROUP_COUNT);
|
||||
skew_start_offset = nir_iadd(b, skew_start_offset, nir_imul_imm(b, nir_udiv_imm(b, vtx, 32), skew_group_size));
|
||||
|
||||
isbe_offset = nir_iadd(b, nir_iadd_imm(b, nir_iadd(b, nir_imul_imm(b, nir_imod_imm(b, vtx, 32), 4),
|
||||
skew_start_offset),
|
||||
skew_attr_offset),
|
||||
offset_ajusted);
|
||||
stride = 4 * NAK_MESH_SKEW_GROUP_COUNT;
|
||||
}
|
||||
|
||||
if (is_store) {
|
||||
u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
|
||||
nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride);
|
||||
nir_def *c_data = nir_channel(b, data, c);
|
||||
|
||||
/* Handle indices conversion */
|
||||
if (is_primitive_indices)
|
||||
c_data = nir_u2u8(b, c_data);
|
||||
|
||||
nir_isbewr_nv(b, c_data, c_offset, .range_base = base_addr,
|
||||
.range = range, .flags = NAK_AS_U32(flags));
|
||||
}
|
||||
} else {
|
||||
const uint8_t bit_size = is_primitive_indices ? 8 : intrin->def.bit_size;
|
||||
|
||||
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
|
||||
for (uint32_t c = 0; c < intrin->num_components; c++) {
|
||||
nir_def *c_offset = nir_iadd_imm(b, isbe_offset, c * stride);
|
||||
nir_def *c_data =
|
||||
nir_isberd_nv(b, bit_size, c_offset, .range_base = base_addr,
|
||||
.range = range, .flags = NAK_AS_U32(flags));
|
||||
|
||||
/* Handle indices conversion */
|
||||
if (is_primitive_indices)
|
||||
c_data = nir_u2u32(b, c_data);
|
||||
|
||||
comps[c] = c_data;
|
||||
}
|
||||
|
||||
nir_def *dst = nir_vec(b, comps, intrin->num_components);
|
||||
nir_def_rewrite_uses(&intrin->def, dst);
|
||||
}
|
||||
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_set_vertex_and_primitive_count(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *primitive_count = intrin->src[1].ssa;
|
||||
nir_def *offset = nir_imm_int(b, 0x3);
|
||||
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_MAP,
|
||||
.output = true,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
nir_isbewr_nv(b, primitive_count, offset,
|
||||
.flags = NAK_AS_U32(flags));
|
||||
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_load_workgroup_index(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
bool from_skew)
|
||||
{
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(b->shader);
|
||||
|
||||
/* We need to make sure that this is read before any writes to allow ISBE
|
||||
* space sharing optimisation to happen */
|
||||
b->cursor = nir_before_impl(impl);
|
||||
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = false,
|
||||
.skew = from_skew,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
nir_def *dst = nir_isberd_nv(b, 32, nir_imm_int(b, 0),
|
||||
.range_base = NAK_ATTR_VERTEX_ID,
|
||||
.range = 4,
|
||||
.flags = NAK_AS_U32(flags));
|
||||
|
||||
nir_def_rewrite_uses(&intrin->def, dst);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_load_num_workgroups(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||
{
|
||||
/* If we are here, we have a task shader */
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = false,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
nir_def *x =
|
||||
nir_isberd_nv(b, 32, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags),
|
||||
.access = ACCESS_CAN_REORDER);
|
||||
nir_def *y =
|
||||
nir_isberd_nv(b, 32, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags),
|
||||
.access = ACCESS_CAN_REORDER);
|
||||
nir_def *z =
|
||||
nir_isberd_nv(b, 32, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags),
|
||||
.access = ACCESS_CAN_REORDER);
|
||||
nir_def *dst = nir_vec3(b, x, y, z);
|
||||
nir_def_rewrite_uses(&intrin->def, dst);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_load_shared(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
uint32_t base_offset)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *offset = intrin->src[0].ssa;
|
||||
|
||||
const uint8_t bit_size = intrin->def.bit_size;
|
||||
assert(bit_size == 32 && intrin->def.num_components == 1);
|
||||
|
||||
const uint32_t base = nir_intrinsic_base(intrin);
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = true,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
offset = nir_iadd_imm(b, offset, base_offset + base);
|
||||
nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags));
|
||||
nir_def_rewrite_uses(&intrin->def, dst);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_store_shared(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
uint32_t base_offset)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *value = intrin->src[0].ssa;
|
||||
nir_def *offset = intrin->src[1].ssa;
|
||||
|
||||
const uint8_t bit_size = value->bit_size;
|
||||
assert(bit_size == 32 &&
|
||||
nir_intrinsic_write_mask(intrin) == nir_component_mask(1));
|
||||
|
||||
const uint32_t base = nir_intrinsic_base(intrin);
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = true,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
offset = nir_iadd_imm(b, offset, base_offset + base);
|
||||
nir_isbewr_nv(b, value, offset, .flags = NAK_AS_U32(flags));
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_load_task_payload(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *offset = intrin->src[0].ssa;
|
||||
|
||||
const uint8_t bit_size = intrin->def.bit_size;
|
||||
assert(bit_size == 32 && intrin->def.num_components == 1);
|
||||
|
||||
const uint32_t base = nir_intrinsic_base(intrin);
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = false,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
offset = nir_iadd_imm(b, offset, base);
|
||||
nir_def *dst = nir_isberd_nv(b, 32, offset, .flags = NAK_AS_U32(flags),
|
||||
.access = ACCESS_CAN_REORDER);
|
||||
nir_def_rewrite_uses(&intrin->def, dst);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_mesh_intrin(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
void *cb_data)
|
||||
{
|
||||
const struct lower_mesh_intrinsics_ctx *ctx = cb_data;
|
||||
|
||||
/* Shared memory is after attributes on mesh shaders */
|
||||
const uint32_t shared_memory_base = nak_mesh_skew_total_size(ctx);
|
||||
assert(shared_memory_base % 0x80 == 0);
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_per_vertex_output:
|
||||
case nir_intrinsic_load_per_primitive_output:
|
||||
case nir_intrinsic_store_per_vertex_output:
|
||||
case nir_intrinsic_store_per_primitive_output:
|
||||
return lower_mesh_io_intrin(b, intrin, ctx);
|
||||
case nir_intrinsic_set_vertex_and_primitive_count:
|
||||
return lower_set_vertex_and_primitive_count(b, intrin);
|
||||
case nir_intrinsic_load_workgroup_index:
|
||||
return lower_load_workgroup_index(b, intrin, !ctx->has_task_shader);
|
||||
case nir_intrinsic_load_num_workgroups:
|
||||
return lower_load_num_workgroups(b, intrin);
|
||||
case nir_intrinsic_load_shared:
|
||||
return lower_load_shared(b, intrin, shared_memory_base);
|
||||
case nir_intrinsic_store_shared:
|
||||
return lower_store_shared(b, intrin, shared_memory_base);
|
||||
case nir_intrinsic_load_task_payload:
|
||||
return lower_load_task_payload(b, intrin);
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_swap:
|
||||
UNREACHABLE(
|
||||
"Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics");
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct lower_emulated_attributes_state {
|
||||
uint32_t viewport_shared_offset;
|
||||
uint32_t cullprimitive_shared_offset;
|
||||
};
|
||||
|
||||
static bool
|
||||
lower_emulated_attributes_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
void *_data)
|
||||
{
|
||||
const struct lower_emulated_attributes_state *state = _data;
|
||||
nir_def *vtx = NULL, *offset = NULL;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_per_primitive_output:
|
||||
vtx = intrin->src[0].ssa;
|
||||
offset = intrin->src[1].ssa;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_store_per_primitive_output:
|
||||
vtx = intrin->src[1].ssa;
|
||||
offset = intrin->src[2].ssa;
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
|
||||
|
||||
if (sem.location != VARYING_SLOT_VIEWPORT &&
|
||||
sem.location != VARYING_SLOT_CULL_PRIMITIVE)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
nir_def *shared_offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intrin));
|
||||
shared_offset = nir_iadd(b, shared_offset, nir_imul_imm(b, vtx, 4));
|
||||
|
||||
if (sem.location == VARYING_SLOT_CULL_PRIMITIVE)
|
||||
shared_offset =
|
||||
nir_iadd_imm(b, shared_offset, state->cullprimitive_shared_offset);
|
||||
else
|
||||
shared_offset =
|
||||
nir_iadd_imm(b, shared_offset, state->viewport_shared_offset);
|
||||
|
||||
if (intrin->intrinsic == nir_intrinsic_store_per_primitive_output) {
|
||||
nir_def *data = intrin->src[0].ssa;
|
||||
switch (sem.location) {
|
||||
case VARYING_SLOT_VIEWPORT:
|
||||
/* In case of Viewport, the data needs to be translated to a proper
|
||||
* mask value to map to ViewportMask */
|
||||
data = nir_ishl(b, nir_imm_int(b, 1), data);
|
||||
break;
|
||||
case VARYING_SLOT_CULL_PRIMITIVE:
|
||||
/* In case of CullPrimitive, the data is already a 32-bit value so no
|
||||
* translation is needed */
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Should never happen");
|
||||
}
|
||||
|
||||
nir_store_shared(b, data, shared_offset);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
} else {
|
||||
/* Reading back isn't allowed by VK_EXT_mesh_shader but allowed by
|
||||
* VK_NV_mesh_shader. We support readback for completeness and in case we
|
||||
* add support for NV specific extension in the future */
|
||||
nir_def *data = nir_load_shared(b, 1, 32, shared_offset);
|
||||
switch (sem.location) {
|
||||
case VARYING_SLOT_VIEWPORT:
|
||||
/* In case of Viewport, find the first index that is set. */
|
||||
data = nir_find_lsb(b, data);
|
||||
break;
|
||||
case VARYING_SLOT_CULL_PRIMITIVE:
|
||||
/* In case of CullPrimitive, we check if no bits are set */
|
||||
data = nir_ine_imm(b, data, 0);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Should never happen");
|
||||
}
|
||||
nir_def_replace(&intrin->def, data);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
nak_nir_lower_mesh_emulated_attributes(nir_shader *nir)
|
||||
{
|
||||
if (nir->info.stage != MESA_SHADER_MESH)
|
||||
return false;
|
||||
|
||||
/* Only apply this pass when really needed */
|
||||
if ((nir->info.per_primitive_outputs &
|
||||
(VARYING_BIT_CULL_PRIMITIVE | VARYING_BIT_VIEWPORT)) == 0)
|
||||
return false;
|
||||
|
||||
/* If we are here, we need to emulate Viewport / CullPrimitive with the
|
||||
* ViewportMask. This means if we need to always keep a shadow copy of the
|
||||
* ViewportMask and CullPrimitive in shared memory and write the actual
|
||||
* ViewportMask at the end of the shader. */
|
||||
bool progress = false;
|
||||
|
||||
/* Reserve space for the Viewport and CullPrimitive shadow copies */
|
||||
uint32_t shared_memory_offset = nir->info.shared_size;
|
||||
nir->info.shared_size += 8 * nir->info.mesh.max_primitives_out;
|
||||
|
||||
struct lower_emulated_attributes_state state = {
|
||||
.viewport_shared_offset = shared_memory_offset,
|
||||
.cullprimitive_shared_offset =
|
||||
shared_memory_offset + 4 * nir->info.mesh.max_primitives_out,
|
||||
};
|
||||
|
||||
/* First we lower things to shared memory */
|
||||
progress |= nir_shader_intrinsics_pass(nir, lower_emulated_attributes_intrin,
|
||||
nir_metadata_control_flow, &state);
|
||||
|
||||
/* Finally, we ensure that the shared region is init at the start of the
|
||||
* shader and we add primitive writes at the end of the shader to write the
|
||||
* real value depending on the culling state.*/
|
||||
if (progress) {
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
|
||||
nir_builder b = nir_builder_at(nir_before_impl(impl));
|
||||
nir_def *zero = nir_imm_int(&b, 0);
|
||||
nir_def *viewport_default = nir_imm_int(&b, 1 << 0);
|
||||
|
||||
nir_def *lane_id =
|
||||
nak_nir_load_sysval(&b, NAK_SV_LANE_ID, ACCESS_CAN_REORDER);
|
||||
nir_push_if(&b, nir_ieq(&b, lane_id, zero));
|
||||
{
|
||||
for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) {
|
||||
nir_store_shared(
|
||||
&b, viewport_default,
|
||||
nir_imm_int(&b, state.viewport_shared_offset + i * 4));
|
||||
nir_store_shared(
|
||||
&b, zero,
|
||||
nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4));
|
||||
}
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
|
||||
nir_var_mem_shared);
|
||||
|
||||
b = nir_builder_at(nir_after_impl(impl));
|
||||
nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
|
||||
nir_var_mem_shared);
|
||||
nir_push_if(&b, nir_ieq(&b, lane_id, zero));
|
||||
{
|
||||
for (uint32_t i = 0; i < nir->info.mesh.max_primitives_out; i++) {
|
||||
nir_def *viewport_mask = nir_load_shared(
|
||||
&b, 1, 32,
|
||||
nir_imm_int(&b, state.viewport_shared_offset + i * 4));
|
||||
nir_def *cull_primitive = nir_load_shared(
|
||||
&b, 1, 32,
|
||||
nir_imm_int(&b, state.cullprimitive_shared_offset + i * 4));
|
||||
|
||||
viewport_mask = nir_bcsel(&b, nir_ine_imm(&b, cull_primitive, 0),
|
||||
zero, viewport_mask);
|
||||
nir_store_per_primitive_output(
|
||||
&b, viewport_mask, nir_imm_int(&b, i), zero, .base = 0,
|
||||
.src_type = nir_type_uint32,
|
||||
.io_semantics = (nir_io_semantics){
|
||||
.location = VARYING_SLOT_VIEWPORT_MASK,
|
||||
.num_slots = 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
nak_nir_lower_mesh_intrinsics(nir_shader *nir,
|
||||
struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return nir_shader_intrinsics_pass(
|
||||
nir, lower_mesh_intrin, nir_metadata_block_index | nir_metadata_dominance,
|
||||
ctx);
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_launch_mesh_workgroups(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *dim = intrin->src[0].ssa;
|
||||
nir_def *x = nir_channel(b, dim, 0);
|
||||
nir_def *y = nir_channel(b, dim, 1);
|
||||
nir_def *z = nir_channel(b, dim, 2);
|
||||
nir_def *task_count = nir_imul(b, nir_imul(b, x, y), z);
|
||||
|
||||
const struct nak_nir_isbe_flags flags = {
|
||||
.access = NAK_ISBE_ACCESS_ATTR,
|
||||
.output = true,
|
||||
.skew = false,
|
||||
.per_primitive = false,
|
||||
};
|
||||
|
||||
nir_isbewr_nv(b, task_count, nir_imm_int(b, 0x4),
|
||||
.flags = NAK_AS_U32(flags));
|
||||
nir_isbewr_nv(b, x, nir_imm_int(b, 0x8), .flags = NAK_AS_U32(flags));
|
||||
nir_isbewr_nv(b, y, nir_imm_int(b, 0xC), .flags = NAK_AS_U32(flags));
|
||||
nir_isbewr_nv(b, z, nir_imm_int(b, 0x10), .flags = NAK_AS_U32(flags));
|
||||
nir_instr_remove(&intrin->instr);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_task_intrin(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
void *cb_data)
|
||||
{
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_shared:
|
||||
return lower_load_shared(b, intrin, 0);
|
||||
case nir_intrinsic_store_shared:
|
||||
return lower_store_shared(b, intrin, 0);
|
||||
case nir_intrinsic_load_workgroup_index:
|
||||
return lower_load_workgroup_index(b, intrin, true);
|
||||
case nir_intrinsic_launch_mesh_workgroups:
|
||||
return lower_launch_mesh_workgroups(b, intrin);
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_swap:
|
||||
UNREACHABLE(
|
||||
"Should have been lowered by nak_nir_lower_mesh_stages_shared_atomics");
|
||||
case nir_intrinsic_load_task_payload:
|
||||
case nir_intrinsic_store_task_payload:
|
||||
case nir_intrinsic_task_payload_atomic:
|
||||
case nir_intrinsic_task_payload_atomic_swap:
|
||||
UNREACHABLE("Should have been lowered by nvk_nir_lower_task_shader");
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
nak_nir_lower_task_intrinsics(nir_shader *nir)
|
||||
{
|
||||
return nir_shader_intrinsics_pass(nir, lower_task_intrin,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
NULL);
|
||||
}
|
||||
|
|
@ -76,6 +76,8 @@ enum ENUM_PACKED nak_attr {
|
|||
NAK_ATTR_INSTANCE_ID = 0x2f8,
|
||||
NAK_ATTR_VERTEX_ID = 0x2fc,
|
||||
|
||||
/* System values D */
|
||||
NAK_ATTR_VIEWPORT_MASK = 0x3a0,
|
||||
NAK_ATTR_BARY_COORD_NO_PERSP_X = 0x3a8,
|
||||
NAK_ATTR_BARY_COORD_NO_PERSP_Y = 0x3ac,
|
||||
NAK_ATTR_BARY_COORD_NO_PERSP_Z = 0x3b0,
|
||||
|
|
@ -84,7 +86,8 @@ enum ENUM_PACKED nak_attr {
|
|||
NAK_ATTR_BARY_COORD_X = 0x3b4,
|
||||
NAK_ATTR_BARY_COORD_Y = 0x3b8,
|
||||
NAK_ATTR_BARY_COORD_Z = 0x3bc,
|
||||
NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X,
|
||||
NAK_ATTR_BARY_COORD = NAK_ATTR_BARY_COORD_X,
|
||||
NAK_ATTR_SPH_END = NAK_ATTR_BARY_COORD_Z + 4,
|
||||
|
||||
/* Not in SPH */
|
||||
NAK_ATTR_FRONT_FACE = 0x3fc,
|
||||
|
|
@ -100,6 +103,8 @@ nak_attribute_attr_addr(UNUSED const struct nak_compiler *nak,
|
|||
|
||||
uint16_t nak_varying_attr_addr(const struct nak_compiler *nak,
|
||||
gl_varying_slot slot);
|
||||
uint16_t nak_varying_mesh_skew_attr_addr(const struct nak_compiler *nak,
|
||||
gl_varying_slot slot);
|
||||
uint16_t nak_sysval_attr_addr(const struct nak_compiler *nak,
|
||||
gl_system_value sysval);
|
||||
|
||||
|
|
@ -274,6 +279,81 @@ struct nak_nir_isbe_flags {
|
|||
uint32_t pad : 27;
|
||||
};
|
||||
|
||||
struct lower_mesh_intrinsics_ctx {
|
||||
const struct nak_compiler *nak;
|
||||
|
||||
uint32_t max_vertices_out;
|
||||
uint32_t max_primitives_out;
|
||||
bool has_task_shader;
|
||||
|
||||
BITSET_DECLARE(skew_vert_attr_used, NAK_ATTR_SPH_END);
|
||||
BITSET_DECLARE(skew_prim_attr_used, NAK_ATTR_SPH_END);
|
||||
};
|
||||
|
||||
#define NAK_MESH_SKEW_GROUP_COUNT 32
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_attr_used_index(uint32_t base_addr)
|
||||
{
|
||||
assert(base_addr < NAK_ATTR_SPH_END);
|
||||
|
||||
return base_addr / 4;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_vert_group_size(const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return BITSET_COUNT(ctx->skew_vert_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_vert_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return nak_mesh_skew_vert_group_size(ctx) * DIV_ROUND_UP(ctx->max_vertices_out, NAK_MESH_SKEW_GROUP_COUNT);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_prim_group_size(const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return BITSET_COUNT(ctx->skew_prim_attr_used) * 4 * NAK_MESH_SKEW_GROUP_COUNT;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_prim_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return nak_mesh_skew_prim_group_size(ctx) * DIV_ROUND_UP(ctx->max_primitives_out, NAK_MESH_SKEW_GROUP_COUNT);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_total_size(const struct lower_mesh_intrinsics_ctx *ctx)
|
||||
{
|
||||
return nak_mesh_skew_vert_total_size(ctx) + nak_mesh_skew_prim_total_size(ctx);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nak_mesh_skew_offset(const struct lower_mesh_intrinsics_ctx *ctx,
|
||||
gl_varying_slot slot,
|
||||
uint32_t base_addr,
|
||||
bool per_primitive)
|
||||
{
|
||||
const uint32_t bit_idx = nak_mesh_skew_attr_used_index(base_addr);
|
||||
|
||||
uint32_t bit_count;
|
||||
|
||||
if (per_primitive)
|
||||
bit_count = BITSET_PREFIX_SUM(ctx->skew_prim_attr_used, bit_idx);
|
||||
else
|
||||
bit_count = BITSET_PREFIX_SUM(ctx->skew_vert_attr_used, bit_idx);
|
||||
|
||||
uint32_t size = bit_count * 4 * NAK_MESH_SKEW_GROUP_COUNT;
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
bool nak_nir_lower_mesh_emulated_attributes(nir_shader *nir);
|
||||
bool nak_nir_lower_mesh_intrinsics(nir_shader *nir, struct lower_mesh_intrinsics_ctx *ctx);
|
||||
bool nak_nir_lower_task_intrinsics(nir_shader *nir);
|
||||
|
||||
enum nak_interp_mode {
|
||||
NAK_INTERP_MODE_PERSPECTIVE,
|
||||
NAK_INTERP_MODE_SCREEN_LINEAR,
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ nvk_files = files(
|
|||
'nvk_mme.c',
|
||||
'nvk_mme.h',
|
||||
'nvk_nir_lower_descriptors.c',
|
||||
'nvk_nir_lower_mesh_shader.c',
|
||||
'nvk_physical_device.c',
|
||||
'nvk_physical_device.h',
|
||||
'nvk_private.h',
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ struct lower_descriptors_ctx {
|
|||
bool use_edb_buffer_views;
|
||||
bool clamp_desc_array_bounds;
|
||||
bool indirect_bind;
|
||||
bool has_task_shader;
|
||||
nir_address_format ubo_addr_format;
|
||||
nir_address_format ssbo_addr_format;
|
||||
|
||||
|
|
@ -1122,6 +1123,8 @@ static bool
|
|||
try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
const struct lower_descriptors_ctx *ctx)
|
||||
{
|
||||
const mesa_shader_stage stage = b->shader->info.stage;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_constant:
|
||||
return lower_load_constant(b, intrin, ctx);
|
||||
|
|
@ -1133,10 +1136,20 @@ try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
|||
UNREACHABLE("Should have been lowered by nir_lower_cs_intrinsics()");
|
||||
|
||||
case nir_intrinsic_load_num_workgroups:
|
||||
/* We use ISBE.ATTR to pass this from task. */
|
||||
if (stage == MESA_SHADER_MESH && ctx->has_task_shader)
|
||||
return false;
|
||||
|
||||
if (stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK)
|
||||
return lower_sysval_to_root_table(b, intrin, draw.mesh.group_count, ctx);
|
||||
|
||||
return lower_sysval_to_root_table(b, intrin, cs.group_count, ctx);
|
||||
|
||||
case nir_intrinsic_load_base_workgroup_id:
|
||||
return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx);
|
||||
if (stage == MESA_SHADER_COMPUTE)
|
||||
return lower_sysval_to_root_table(b, intrin, cs.base_group, ctx);
|
||||
|
||||
return false;
|
||||
|
||||
case nir_intrinsic_load_push_constant:
|
||||
return lower_load_push_constant(b, intrin, ctx);
|
||||
|
|
@ -1548,6 +1561,7 @@ nvk_nir_lower_descriptors(nir_shader *nir,
|
|||
rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
|
||||
.indirect_bind =
|
||||
shader_flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT,
|
||||
.has_task_shader = (shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0,
|
||||
.ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs),
|
||||
.ubo_addr_format = nvk_ubo_addr_format(pdev, rs),
|
||||
};
|
||||
|
|
|
|||
190
src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c
Normal file
190
src/nouveau/vulkan/nvk_nir_lower_mesh_shader.c
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
/*
|
||||
* Copyright © 2026 Valve Corporation.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "util/macros.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "nir_defines.h"
|
||||
#include "nvk_shader.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
static bool
|
||||
add_task_payload_base_offset(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
void *data)
|
||||
{
|
||||
const uint32_t *offset = data;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_task_payload:
|
||||
case nir_intrinsic_store_task_payload:
|
||||
case nir_intrinsic_task_payload_atomic:
|
||||
case nir_intrinsic_task_payload_atomic_swap:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned base = nir_intrinsic_base(intrin);
|
||||
nir_intrinsic_set_base(intrin, base + *offset);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
nvk_nir_lower_common_task_payload(nir_shader *nir)
|
||||
{
|
||||
/* The first 0x20 bytes are used by launch_mesh_workgroups */
|
||||
uint32_t task_payload_reserved_size = 0x20;
|
||||
|
||||
/* Take into account the reserved chunk in task memory */
|
||||
nir->info.task_payload_size += task_payload_reserved_size;
|
||||
|
||||
/* Add the reserved chunk to every task payload accesses */
|
||||
return nir_shader_intrinsics_pass(nir, add_task_payload_base_offset,
|
||||
nir_metadata_all,
|
||||
&task_payload_reserved_size);
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_set_vertex_and_primitive_count_intrin(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
UNUSED void *data)
|
||||
{
|
||||
if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *local_invocation_index = nir_load_local_invocation_index(b);
|
||||
nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0)));
|
||||
{
|
||||
nir_set_vertex_and_primitive_count(
|
||||
b, intrin->src[0].ssa, intrin->src[1].ssa, intrin->src[2].ssa);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags)
|
||||
{
|
||||
if (nir->info.stage != MESA_SHADER_MESH)
|
||||
return false;
|
||||
|
||||
bool progress = false;
|
||||
|
||||
if ((shader_flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT) == 0)
|
||||
progress |= nvk_nir_lower_common_task_payload(nir);
|
||||
|
||||
progress |= nir_shader_intrinsics_pass(
|
||||
nir, lower_set_vertex_and_primitive_count_intrin, nir_metadata_none,
|
||||
NULL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
launch_mesh_workgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
UNUSED void *data)
|
||||
{
|
||||
if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_def *local_invocation_index = nir_load_local_invocation_index(b);
|
||||
nir_push_if(b, nir_ieq(b, local_invocation_index, nir_imm_int(b, 0)));
|
||||
{
|
||||
nir_launch_mesh_workgroups(b, intrin->src[0].ssa);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static nir_intrinsic_op
|
||||
task_payload_intrinsic_to_shared(nir_intrinsic_op op)
|
||||
{
|
||||
switch (op) {
|
||||
case nir_intrinsic_load_task_payload:
|
||||
return nir_intrinsic_load_shared;
|
||||
case nir_intrinsic_store_task_payload:
|
||||
return nir_intrinsic_store_shared;
|
||||
case nir_intrinsic_task_payload_atomic:
|
||||
return nir_intrinsic_shared_atomic;
|
||||
case nir_intrinsic_task_payload_atomic_swap:
|
||||
return nir_intrinsic_shared_atomic_swap;
|
||||
default:
|
||||
return nir_num_intrinsics;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_task_payload_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
UNUSED void *data)
|
||||
{
|
||||
nir_intrinsic_op new_op =
|
||||
task_payload_intrinsic_to_shared(intrin->intrinsic);
|
||||
if (new_op == nir_num_intrinsics)
|
||||
return false;
|
||||
|
||||
intrin->intrinsic = new_op;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
add_shared_base_offset(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
UNUSED void *data)
|
||||
{
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_swap:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint32_t shared_memory_base = b->shader->info.task_payload_size;
|
||||
assert(shared_memory_base % 0x80 == 0);
|
||||
|
||||
unsigned base = nir_intrinsic_base(intrin);
|
||||
nir_intrinsic_set_base(intrin, base + shared_memory_base);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
nvk_nir_lower_task_shader(nir_shader *nir)
|
||||
{
|
||||
if (nir->info.stage != MESA_SHADER_TASK)
|
||||
return false;
|
||||
|
||||
bool progress = false;
|
||||
|
||||
/* Apply common lowering for task payload */
|
||||
progress |= nvk_nir_lower_common_task_payload(nir);
|
||||
|
||||
/* Ensure alignment based on ISBE mem lines size (128 bytes) */
|
||||
nir->info.task_payload_size = align(nir->info.task_payload_size, 128);
|
||||
|
||||
/* Readjust shared memory size to include the task payload */
|
||||
nir->info.shared_size += nir->info.task_payload_size;
|
||||
|
||||
/* Now move all shared memory after task payload range and lower task payload
|
||||
* to shared memory */
|
||||
progress |= nir_shader_intrinsics_pass(nir, add_shared_base_offset,
|
||||
nir_metadata_all, NULL);
|
||||
progress |= nir_shader_intrinsics_pass(nir, lower_task_payload_intrin,
|
||||
nir_metadata_all, NULL);
|
||||
|
||||
/* Finally we ensure that launch_mesh_workgroups is only running on lane 0 */
|
||||
progress |= nir_shader_intrinsics_pass(nir, launch_mesh_workgroups_intrin,
|
||||
nir_metadata_none, NULL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
@ -382,8 +382,10 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
|
|||
lookup_ycbcr_conversion, &ycbcr_state);
|
||||
|
||||
nir_lower_compute_system_values_options csv_options = {
|
||||
.has_base_workgroup_id = true,
|
||||
.has_base_workgroup_id = mesa_shader_stage_is_compute(nir->info.stage),
|
||||
.lower_local_invocation_index = mesa_shader_stage_is_compute(nir->info.stage),
|
||||
.lower_workgroup_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage),
|
||||
.lower_cs_local_id_to_index = mesa_shader_stage_is_mesh(nir->info.stage),
|
||||
};
|
||||
NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
|
||||
|
||||
|
|
@ -461,19 +463,43 @@ nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
|
|||
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
|
||||
lower_load_intrinsic, nir_metadata_none, pdev);
|
||||
|
||||
if (mesa_shader_stage_is_compute(nir->info.stage)) {
|
||||
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
|
||||
nir_var_mem_shared, shared_var_info);
|
||||
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
|
||||
if (mesa_shader_stage_uses_workgroup(nir->info.stage)) {
|
||||
nir_variable_mode var_modes = nir_var_mem_shared;
|
||||
|
||||
if (mesa_shader_stage_is_mesh(nir->info.stage))
|
||||
var_modes |= nir_var_mem_task_payload;
|
||||
|
||||
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, var_modes,
|
||||
shared_var_info);
|
||||
NIR_PASS(_, nir, nir_lower_explicit_io, var_modes,
|
||||
nir_address_format_32bit_offset);
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_TASK)
|
||||
NIR_PASS(_, nir, nvk_nir_lower_task_shader);
|
||||
else if (nir->info.stage == MESA_SHADER_MESH)
|
||||
NIR_PASS(_, nir, nvk_nir_lower_mesh_shader, shader_flags);
|
||||
|
||||
if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
|
||||
/* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's safe to
|
||||
* align everything up to 16B so we can write whole vec4s.
|
||||
*/
|
||||
nir->info.shared_size = align(nir->info.shared_size, 16);
|
||||
uint32_t alignment;
|
||||
uint32_t chunk_size;
|
||||
|
||||
if (mesa_shader_stage_is_mesh(nir->info.stage)) {
|
||||
/* With task/mesh shaders, shared is in ISBE attribute space and is
|
||||
* allocated in "lines" of 128 bytes. Additionally, we ISBE I/O
|
||||
* instructions only support 1B and 4B granualities.*/
|
||||
alignment = 128;
|
||||
chunk_size = 4;
|
||||
} else {
|
||||
/* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's
|
||||
* safe to align everything up to 16B so we can write whole vec4s.
|
||||
*/
|
||||
alignment = 16;
|
||||
chunk_size = 16;
|
||||
}
|
||||
|
||||
nir->info.shared_size = align(nir->info.shared_size, alignment);
|
||||
NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
|
||||
nir->info.shared_size, 16);
|
||||
nir->info.shared_size, chunk_size);
|
||||
|
||||
/* We need to call lower_compute_system_values again because
|
||||
* nir_zero_initialize_shared_memory generates load_invocation_id which
|
||||
|
|
|
|||
|
|
@ -158,6 +158,9 @@ nvk_nir_lower_descriptors(nir_shader *nir,
|
|||
struct vk_descriptor_set_layout * const *set_layouts,
|
||||
struct nvk_cbuf_map *cbuf_map_out);
|
||||
|
||||
bool nvk_nir_lower_mesh_shader(nir_shader *nir, VkShaderCreateFlagsEXT shader_flags);
|
||||
bool nvk_nir_lower_task_shader(nir_shader *nir);
|
||||
|
||||
VkResult
|
||||
nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir,
|
||||
const VkAllocationCallbacks *alloc,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue