From 976f22a5da30c503b05e2873e4d265fb0d4ecb07 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Thu, 22 Aug 2024 13:22:41 -0500 Subject: [PATCH] nvk: Implement CmdProcess/ExecuteGeneratedCommandsEXT Part-of: --- src/nouveau/vulkan/meson.build | 1 + src/nouveau/vulkan/nvk_cmd_buffer.h | 19 +- src/nouveau/vulkan/nvk_cmd_indirect.c | 1104 +++++++++++++++++++++++++ 3 files changed, 1122 insertions(+), 2 deletions(-) create mode 100644 src/nouveau/vulkan/nvk_cmd_indirect.c diff --git a/src/nouveau/vulkan/meson.build b/src/nouveau/vulkan/meson.build index bcb217c4608..842a724a129 100644 --- a/src/nouveau/vulkan/meson.build +++ b/src/nouveau/vulkan/meson.build @@ -13,6 +13,7 @@ nvk_files = files( 'nvk_cmd_copy.c', 'nvk_cmd_dispatch.c', 'nvk_cmd_draw.c', + 'nvk_cmd_indirect.c', 'nvk_cmd_meta.c', 'nvk_cmd_pool.c', 'nvk_cmd_pool.h', diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.h b/src/nouveau/vulkan/nvk_cmd_buffer.h index 11741a39080..fa9e83f479d 100644 --- a/src/nouveau/vulkan/nvk_cmd_buffer.h +++ b/src/nouveau/vulkan/nvk_cmd_buffer.h @@ -198,7 +198,7 @@ struct nvk_cmd_push { struct nvk_cmd_buffer { struct vk_command_buffer vk; - struct { + struct nvk_cmd_state { uint64_t descriptor_buffers[NVK_MAX_SETS]; struct nvk_graphics_state gfx; struct nvk_compute_state cs; @@ -311,7 +311,22 @@ nvk_get_descriptors_state(struct nvk_cmd_buffer *cmd, default: unreachable("Unhandled bind point"); } -}; +} + +static inline struct nvk_descriptor_state * +nvk_get_descriptor_state_for_stages(struct nvk_cmd_buffer *cmd, + VkShaderStageFlags stages) +{ + if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { + assert(stages == VK_SHADER_STAGE_COMPUTE_BIT); + return &cmd->state.cs.descriptors; + } else if (stages & NVK_SHADER_STAGE_GRAPHICS_BITS) { + assert(!(stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS)); + return &cmd->state.gfx.descriptors; + } else { + unreachable("Unknown shader stage"); + } +} VkResult nvk_cmd_buffer_upload_alloc(struct nvk_cmd_buffer *cmd, uint32_t size, uint32_t alignment, diff --git a/src/nouveau/vulkan/nvk_cmd_indirect.c b/src/nouveau/vulkan/nvk_cmd_indirect.c new file mode 100644 index 00000000000..698c3885afc --- /dev/null +++ b/src/nouveau/vulkan/nvk_cmd_indirect.c @@ -0,0 +1,1104 @@ +/* + * Copyright © 2024 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "nvk_cmd_buffer.h" +#include "nvk_device.h" +#include "nvk_entrypoints.h" +#include "nvk_indirect_execution_set.h" +#include "nvk_mme.h" +#include "nvk_physical_device.h" +#include "nvk_shader.h" + +#include "nir_builder.h" +#include "vk_pipeline.h" + +#include "nv_push.h" +#include "nv_push_cl9097.h" +#include "nv_push_cl906f.h" +#include "nv_push_cla0c0.h" +#include "nv_push_clb1c0.h" +#include "nv_push_clc6c0.h" + +struct nvk_indirect_commands_layout { + struct vk_object_base base; + + VkShaderStageFlags stages; + + /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */ + VkShaderStageFlags set_stages; + + uint32_t cmd_seq_stride_B; + uint32_t qmd_size_per_seq_B; + + struct nvk_shader *init; + struct nvk_shader *process; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base, + VkIndirectCommandsLayoutEXT, + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT); + +struct process_cmd_push { + uint64_t in_addr; + uint64_t out_addr; + uint64_t qmd_pool_addr; + uint64_t count_addr; + uint32_t max_seq_count; + uint32_t ies_stride; + uint64_t ies_addr; + uint64_t root_addr; +}; + +struct process_cmd_in { + nir_def *in_addr; + nir_def *out_addr; + nir_def *qmd_pool_addr; + nir_def *count_addr; + nir_def *max_seq_count; + nir_def *ies_stride; + nir_def *ies_addr; + nir_def *root_addr; +}; + +static nir_def * +load_struct_var(nir_builder *b, nir_variable *var, uint32_t field) +{ + nir_deref_instr *deref = + nir_build_deref_struct(b, nir_build_deref_var(b, var), field); + return nir_load_deref(b, deref); +} + +static struct process_cmd_in +load_process_cmd_in(nir_builder *b) +{ + struct glsl_struct_field push_fields[] = { + { .type = glsl_uint64_t_type(), .name = "in_addr", .offset = 0x00 }, + { .type = glsl_uint64_t_type(), .name = "out_addr", .offset = 0x08 }, + { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 }, + { .type = glsl_uint64_t_type(), .name = "count_addr", .offset = 0x18 }, + { .type = glsl_uint_type(), .name = "max_seq_count", .offset = 0x20 }, + { .type = glsl_uint_type(), .name = "ies_stride", .offset = 0x24 }, + { .type = glsl_uint64_t_type(), .name = "ies_addr", .offset = 0x28 }, + { .type = glsl_uint64_t_type(), .name = "root_addr", .offset = 0x30 }, + }; + const struct glsl_type *push_iface_type = + glsl_interface_type(push_fields, ARRAY_SIZE(push_fields), + GLSL_INTERFACE_PACKING_STD140, + false /* row_major */, "push"); + nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const, + push_iface_type, "push"); + + return (struct process_cmd_in) { + .in_addr = load_struct_var(b, push, 0), + .out_addr = load_struct_var(b, push, 1), + .qmd_pool_addr = load_struct_var(b, push, 2), + .count_addr = load_struct_var(b, push, 3), + .max_seq_count = load_struct_var(b, push, 4), + .ies_stride = load_struct_var(b, push, 5), + .ies_addr = load_struct_var(b, push, 6), + .root_addr = load_struct_var(b, push, 7), + }; +} + +static nir_def * +build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx) +{ + nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx); + return nir_iadd(b, in->ies_addr, offset); +} + +static nir_def * +load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw) +{ + return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32); +} + +static void +store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw, + nir_def *data) +{ + assert(data->bit_size == 32 && data->num_components == 1); + nir_store_global(b, nir_iadd_imm(b, 
addr, offset_dw * 4), 4, data, 0x1);
+}
+
+static void
+nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
+{
+   nir_def *x = nir_load_var(b, x_var);
+   x = nir_iadd(b, x, y);
+   nir_store_var(b, x_var, x, 0x1);
+}
+
+static void
+nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
+{
+   nir_def *x = nir_load_var(b, x_var);
+   x = nir_iadd_imm(b, x, y);
+   nir_store_var(b, x_var, x, 0x1);
+}
+
+struct nvk_nir_push {
+   nir_variable *addr;
+   nir_variable *dw_count;
+   unsigned max_dw_count;
+};
+
+static void
+nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
+{
+   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
+                                       "nvk_nir_push::addr");
+   nir_store_var(b, p->addr, addr, 0x1);
+   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
+                                           "nvk_nir_push::dw_count");
+   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
+   p->max_dw_count = 0;
+}
+
+static inline void
+nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
+{
+   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
+   nir_iadd_to_var_imm(b, p->addr, 4);
+   nir_iadd_to_var_imm(b, p->dw_count, 1);
+   p->max_dw_count++;
+}
+
+static inline void
+nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
+                      nir_def *src_dw_addr_in, nir_def *dw_count,
+                      uint32_t max_dw_count)
+{
+   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
+   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);
+
+   nir_variable *src_dw_addr =
+      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
+   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);
+
+   nir_push_loop(b);
+   {
+      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
+      {
+         nir_jump(b, nir_jump_break);
+      }
+      nir_pop_if(b, NULL);
+
+      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
+      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
+
+      nir_iadd_to_var_imm(b, i, 1);
+      nir_iadd_to_var_imm(b, p->addr, 4);
+      nir_iadd_to_var_imm(b, src_dw_addr, 4);
+   }
+   nir_pop_loop(b, NULL);
+
+   nir_iadd_to_var(b, p->dw_count, dw_count);
+   p->max_dw_count += max_dw_count;
+}
+
+static inline void
+nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
+                     int subc, int mthd, unsigned dw_count)
+{
+   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
+   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
+}
+#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
+   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))
+
+static void
+nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
+{
+   nir_push_loop(b);
+   {
+      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
+                                 p->max_dw_count));
+      {
+         nir_jump(b, nir_jump_break);
+      }
+      nir_pop_if(b, NULL);
+
+      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
+      nir_iadd_to_var_imm(b, p->addr, 4);
+      nir_iadd_to_var_imm(b, p->dw_count, 1);
+   }
+   nir_pop_loop(b, NULL);
+}
+#define nvk_nir_pad_NOP(b, p, class) \
+   nvk_nir_build_pad_NOP((b), (p), \
+      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))
+
+#define QMD_ALIGN 0x100
+#define QMD_ALLOC_SIZE QMD_ALIGN
+#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
+                       sizeof(struct nvk_root_descriptor_table))
+
+static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
+              "QMD size is not properly aligned");
+static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
+              "Root descriptor table size is not aligned");
+
+static void
+copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def
*src_addr, + nir_def **repl_dw, uint32_t dw_count) +{ + for (uint32_t i = 0; i < dw_count; i++) { + nir_def *dw; + if (repl_dw[i] == NULL) + dw = load_global_dw(b, src_addr, i); + else + dw = repl_dw[i]; + store_global_dw(b, dst_addr, i, dw); + } +} + +static void +build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p, + nir_def *in_addr, nir_def *seq_idx, + struct process_cmd_in *in, + struct nvk_physical_device *pdev, + const VkIndirectCommandsLayoutCreateInfoEXT *info, + uint32_t *qmd_size_per_seq_B_out) +{ + /* If we don't have any indirect execution set, the currently bound shader + * will be passed in there. + */ + nir_def *shader_qmd_addr = in->ies_addr; + + nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {}; + +#define root_dw(member) ( \ + assert(nvk_root_descriptor_offset(member) % 4 == 0), \ + nvk_root_descriptor_offset(member) / 4) + + root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0); + root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0); + root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0); + + *qmd_size_per_seq_B_out = 0; + for (uint32_t t = 0; t < info->tokenCount; t++) { + const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t]; + + nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset); + switch (token->type) { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: { + assert(token->data.pExecutionSet->shaderStages == + VK_SHADER_STAGE_COMPUTE_BIT); + assert(t == 0); + + nir_def *idx = load_global_dw(b, token_addr, 0); + shader_qmd_addr = build_exec_set_addr(b, in, idx); + break; + } + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: { + const VkPushConstantRange *pc_range = + &token->data.pPushConstant->updateRange; + + assert(pc_range->offset % 4 == 0); + assert(pc_range->size % 4 == 0); + + const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4); + for (uint32_t i = 0; i < pc_range->size / 4; i++) + root_repl[start_dw + i] = load_global_dw(b, token_addr, i); + break; + } + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: { + const VkPushConstantRange *pc_range = + &token->data.pPushConstant->updateRange; + + assert(pc_range->offset % 4 == 0); + assert(pc_range->size == 4); + + const uint32_t dw = root_dw(push) + (pc_range->offset / 4); + root_repl[dw] = seq_idx; + break; + } + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: { + nir_def *disp_size_x = load_global_dw(b, token_addr, 0); + nir_def *disp_size_y = load_global_dw(b, token_addr, 1); + nir_def *disp_size_z = load_global_dw(b, token_addr, 2); + + *qmd_size_per_seq_B_out += QMD_ROOT_SIZE; + + nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0), + nir_ine_imm(b, disp_size_y, 0)), + nir_ine_imm(b, disp_size_z, 0))); + { + /* The first dword in qmd_addr is an allocator in units of 256 + * bytes. 
+ */ + nir_def *qmd_idx = + nir_global_atomic(b, 32, in->qmd_pool_addr, + nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN), + .atomic_op = nir_atomic_op_iadd); + nir_def *qmd_offset = + nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN); + nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset); + nir_def *root_addr = + nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd)); + + /* Upload and patch the root descriptor table */ + root_repl[root_dw(cs.group_count[0])] = disp_size_x; + root_repl[root_dw(cs.group_count[1])] = disp_size_y; + root_repl[root_dw(cs.group_count[2])] = disp_size_z; + copy_repl_global_dw(b, root_addr, in->root_addr, + root_repl, ARRAY_SIZE(root_repl)); + + /* Upload and patch the QMD */ + const struct nak_qmd_dispatch_size_layout qmd_layout = + nak_get_qmd_dispatch_size_layout(&pdev->info); + assert(qmd_layout.x_start % 32 == 0); + assert(qmd_layout.y_start == qmd_layout.x_start + 32); + assert(qmd_layout.z_start == qmd_layout.x_start + 64); + + nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {}; + qmd_repl[qmd_layout.x_start / 32] = disp_size_x; + qmd_repl[qmd_layout.y_start / 32] = disp_size_y; + qmd_repl[qmd_layout.z_start / 32] = disp_size_z; + + /* TODO: Get these from NAK? */ + const uint32_t cb0_lo_start = 1024, cb0_hi_start = 1056; + qmd_repl[cb0_lo_start / 32] = nir_unpack_64_2x32_split_x(b, root_addr); + qmd_repl[cb0_hi_start / 32] = + nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_hi_start / 32), + nir_unpack_64_2x32_split_y(b, root_addr)); + + copy_repl_global_dw(b, qmd_addr, shader_qmd_addr, + qmd_repl, ARRAY_SIZE(qmd_repl)); + + /* Now emit commands */ + nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y); + invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z)); + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2); + nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc)); + nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc)); + + nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1); + nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8))); + + if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) { + uint32_t signal; + V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal, + PCAS_ACTION_INVALIDATE_COPY_SCHEDULE); + nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1); + nvk_nir_push_dw(b, p, nir_imm_int(b, signal)); + } else { + uint32_t signal; + V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, { + .invalidate = INVALIDATE_TRUE, + .schedule = SCHEDULE_TRUE + }); + nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1); + nvk_nir_push_dw(b, p, nir_imm_int(b, signal)); + } + } + nir_pop_if(b, NULL); + break; + } + + default: + unreachable("Unsupported indirect token type"); + } + } +} + +/* + * Graphics + */ + +static void +build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr, + struct process_cmd_in *in, + struct nvk_physical_device *pdev, + const VkIndirectCommandsExecutionSetTokenEXT *token) +{ + switch (token->type) { + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: { + nir_def *idx = load_global_dw(b, token_addr, 0); + nir_def *push_addr = build_exec_set_addr(b, in, idx); + nir_def *dw_count = load_global_dw(b, push_addr, 0); + const uint16_t max_dw_count = + nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages); + nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4), + dw_count, max_dw_count); + break; + } + + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: { + int32_t i = 0; + gl_shader_stage type_stage[6] = {}; + nir_def *type_shader_idx[6] = {}; + 
gl_shader_stage last_vtgm = MESA_SHADER_VERTEX; + u_foreach_bit(s, token->shaderStages) { + gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s); + + if (stage != MESA_SHADER_FRAGMENT) + last_vtgm = stage; + + uint32_t type = mesa_to_nv9097_shader_type(stage); + type_stage[type] = stage; + type_shader_idx[type] = load_global_dw(b, token_addr, i++); + } + + for (uint32_t type = 0; type < 6; type++) { + nir_def *shader_idx = type_shader_idx[type]; + if (shader_idx == NULL) + continue; + + bool is_last_vtgm = type_stage[type] == last_vtgm; + + nir_def *push_addr = build_exec_set_addr(b, in, shader_idx); + nir_def *hdr = load_global_dw(b, push_addr, 0); + nir_def *dw_count = + nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm)); + const uint16_t max_dw_count = + nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages, + is_last_vtgm); + nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4), + dw_count, max_dw_count); + } + break; + } + + default: + unreachable("Unknown indirect execution set type"); + } +} + +static void +build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr, + const VkIndirectCommandsPushConstantTokenEXT *token) +{ + const VkPushConstantRange *pc_range = &token->updateRange; + + // TODO: Compute + assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)); + + assert(pc_range->offset % 4 == 0); + assert(pc_range->size % 4 == 0); + const uint32_t dw_count = pc_range->size / 4; + + nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count); + nvk_nir_push_dw(b, p, nir_imm_int(b, + nvk_root_descriptor_offset(push) + pc_range->offset)); + for (uint32_t i = 0; i < dw_count; i++) + nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i)); +} + +static void +build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p, + nir_def *token_addr, nir_def *seq_idx, + const VkIndirectCommandsPushConstantTokenEXT *token) +{ + const VkPushConstantRange *pc_range = &token->updateRange; + + // TODO: Compute + assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)); + + assert(pc_range->offset % 4 == 0); + assert(pc_range->size == 4); + nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2); + nvk_nir_push_dw(b, p, nir_imm_int(b, + nvk_root_descriptor_offset(push) + pc_range->offset)); + nvk_nir_push_dw(b, p, seq_idx); +} + +static void +build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr, + UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token) +{ + nir_def *addr_lo = load_global_dw(b, token_addr, 0); + nir_def *addr_hi = load_global_dw(b, token_addr, 1); + nir_def *size_B = load_global_dw(b, token_addr, 2); + nir_def *idx_fmt = load_global_dw(b, token_addr, 3); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4); + nvk_nir_push_dw(b, p, addr_hi); + nvk_nir_push_dw(b, p, addr_lo); + nvk_nir_push_dw(b, p, size_B); + nvk_nir_push_dw(b, p, idx_fmt); +} + +static nir_def * +nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride) +{ + return nir_pack_32_2x16_split(b, nir_u2u16(b, stride), + nir_u2u16(b, vb_idx)); +} + +static void +build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr, + const VkIndirectCommandsVertexBufferTokenEXT *token) +{ + nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit); + nir_def *addr_lo = load_global_dw(b, token_addr, 0); + nir_def *addr_hi = load_global_dw(b, token_addr, 1); + nir_def *size_B = load_global_dw(b, token_addr, 2); + nir_def *stride_B = load_global_dw(b, token_addr, 3); + + nvk_nir_P_1INC(b, p, NV9097, 
CALL_MME_MACRO(NVK_MME_BIND_VB), 4); + nvk_nir_push_dw(b, p, vb_idx); + nvk_nir_push_dw(b, p, addr_hi); + nvk_nir_push_dw(b, p, addr_lo); + nvk_nir_push_dw(b, p, size_B); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1); + nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B)); +} + +static void +build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr) +{ + nir_def *vertex_count = load_global_dw(b, token_addr, 0); + nir_def *instance_count = load_global_dw(b, token_addr, 1); + nir_def *first_vertex = load_global_dw(b, token_addr, 2); + nir_def *first_instance = load_global_dw(b, token_addr, 3); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5); + nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */ + nvk_nir_push_dw(b, p, vertex_count); + nvk_nir_push_dw(b, p, instance_count); + nvk_nir_push_dw(b, p, first_vertex); + nvk_nir_push_dw(b, p, first_instance); +} + +static void +build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr) +{ + nir_def *index_count = load_global_dw(b, token_addr, 0); + nir_def *instance_count = load_global_dw(b, token_addr, 1); + nir_def *first_index = load_global_dw(b, token_addr, 2); + nir_def *vertex_offset = load_global_dw(b, token_addr, 3); + nir_def *first_instance = load_global_dw(b, token_addr, 4); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6); + nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */ + nvk_nir_push_dw(b, p, index_count); + nvk_nir_push_dw(b, p, instance_count); + nvk_nir_push_dw(b, p, first_index); + nvk_nir_push_dw(b, p, vertex_offset); + nvk_nir_push_dw(b, p, first_instance); +} + +static void +build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr) +{ + nir_def *addr_lo = load_global_dw(b, token_addr, 0); + nir_def *addr_hi = load_global_dw(b, token_addr, 1); + nir_def *stride = load_global_dw(b, token_addr, 2); + nir_def *count = load_global_dw(b, token_addr, 3); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4); + nvk_nir_push_dw(b, p, addr_hi); + nvk_nir_push_dw(b, p, addr_lo); + nvk_nir_push_dw(b, p, count); + nvk_nir_push_dw(b, p, stride); +} + +static void +build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p, + nir_def *token_addr) +{ + nir_def *addr_lo = load_global_dw(b, token_addr, 0); + nir_def *addr_hi = load_global_dw(b, token_addr, 1); + nir_def *stride = load_global_dw(b, token_addr, 2); + nir_def *count = load_global_dw(b, token_addr, 3); + + nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4); + nvk_nir_push_dw(b, p, addr_hi); + nvk_nir_push_dw(b, p, addr_lo); + nvk_nir_push_dw(b, p, count); + nvk_nir_push_dw(b, p, stride); +} + +static void +build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p, + nir_def *in_addr, nir_def *seq_idx, + struct process_cmd_in *in, + struct nvk_physical_device *pdev, + const VkIndirectCommandsLayoutCreateInfoEXT *info) +{ + for (uint32_t t = 0; t < info->tokenCount; t++) { + const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t]; + + nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset); + switch (token->type) { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: + assert(t == 0); + build_gfx_set_exec(b, p, token_addr, in, pdev, + token->data.pExecutionSet); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: + build_push_gfx_const(b, p, token_addr, token->data.pPushConstant); + break; + + case 
VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
+         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
+                                token->data.pPushConstant);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
+         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
+         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
+         build_draw_indexed(b, p, token_addr);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
+         build_draw(b, p, token_addr);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
+         build_draw_indexed_count(b, p, token_addr);
+         break;
+
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
+         build_draw_count(b, p, token_addr);
+         break;
+
+      default:
+         unreachable("Unsupported indirect token type");
+      }
+   }
+}
+
+static VkResult
+build_init_shader(struct nvk_device *dev,
+                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
+                  uint32_t qmd_size_per_seq_B,
+                  const VkAllocationCallbacks *pAllocator,
+                  struct nvk_shader **shader_out)
+{
+   /* There's nothing to initialize for graphics */
+   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
+      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
+      *shader_out = NULL;
+      return VK_SUCCESS;
+   }
+
+   if (qmd_size_per_seq_B == 0) {
+      *shader_out = NULL;
+      return VK_SUCCESS;
+   }
+
+   nir_builder build =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
+                                     "nvk-init-indirect-commands");
+   build.shader->info.workgroup_size[0] = 32;
+   nir_builder *b = &build;
+
+   struct process_cmd_in in = load_process_cmd_in(b);
+
+   if (qmd_size_per_seq_B > 0) {
+      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
+       * allocate don't stomp the allocator.
+       */
+      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
+      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
+   }
+
+   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
+}
+
+static VkResult
+build_process_shader(struct nvk_device *dev,
+                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
+                     const VkAllocationCallbacks *pAllocator,
+                     struct nvk_shader **shader_out,
+                     uint32_t *cmd_seq_stride_B_out,
+                     uint32_t *qmd_size_per_seq_B_out)
+{
+   struct nvk_physical_device *pdev = nvk_device_physical(dev);
+
+   nir_builder build =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
+                                     "nvk-process-indirect-commands");
+   build.shader->info.workgroup_size[0] = 32;
+   nir_builder *b = &build;
+
+   struct process_cmd_in in = load_process_cmd_in(b);
+
+   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
+
+   /* We always execute a 32-wide shader and nothing guarantees that
+    * max_seq_count is a multiple of 32, so we need to bail if our index is
+    * above the maximum. If we're below the maximum but at or above the
+    * count, we still need to emit a bunch of NOPs.
+ */ + nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count)); + { + nir_jump(b, nir_jump_halt); + } + nir_pop_if(b, NULL); + + nir_def *ind_count; + nir_push_if(b, nir_ine_imm(b, in.count_addr, 0)); + { + ind_count = load_global_dw(b, in.count_addr, 0); + ind_count = nir_umin(b, ind_count, in.max_seq_count); + } + nir_pop_if(b, NULL); + nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count); + + nir_def *in_seq_addr = nir_iadd(b, in.in_addr, + nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride)); + /* We'll replace this later once we know what it is */ + nir_def *out_stride = nir_imm_int(b, 0xc0ffee0); + nir_def *out_seq_addr = nir_iadd(b, in.out_addr, + nir_imul_2x32_64(b, seq_idx, out_stride)); + + struct nvk_nir_push push = {}; + nvk_nir_push_start(b, &push, out_seq_addr); + + nir_push_if(b, nir_ult(b, seq_idx, count)); + { + if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) { + assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT); + build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx, + &in, pdev, info, qmd_size_per_seq_B_out); + } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) { + assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS)); + *qmd_size_per_seq_B_out = 0; + build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx, + &in, pdev, info); + } else { + unreachable("Unknown shader stage"); + } + } + nir_pop_if(b, NULL); + + /* Always pad the command buffer. In the case where seq_idx >= count, the + * entire sequence will be NO_OPERATION. + */ + if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) { + nvk_nir_pad_NOP(b, &push, NVA0C0); + } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) { + nvk_nir_pad_NOP(b, &push, NV9097); + } else { + unreachable("Unknown shader stage"); + } + + /* Replace the out stride with the actual size of a command stream */ + nir_load_const_instr *out_stride_const = + nir_instr_as_load_const(out_stride->parent_instr); + out_stride_const->value[0].u32 = push.max_dw_count * 4; + + /* We also output this stride to go in the layout struct */ + *cmd_seq_stride_B_out = push.max_dw_count * 4; + + return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out); +} + +static void +nvk_indirect_commands_layout_destroy(struct nvk_device *dev, + struct nvk_indirect_commands_layout *layout, + const VkAllocationCallbacks *alloc) +{ + if (layout->init != NULL) + vk_shader_destroy(&dev->vk, &layout->init->vk, alloc); + if (layout->process != NULL) + vk_shader_destroy(&dev->vk, &layout->process->vk, alloc); + vk_object_free(&dev->vk, alloc, layout); +} + +VKAPI_ATTR VkResult VKAPI_CALL +nvk_CreateIndirectCommandsLayoutEXT( + VkDevice _device, + const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout) +{ + VK_FROM_HANDLE(nvk_device, dev, _device); + VkResult result; + + struct nvk_indirect_commands_layout *layout = + vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout), + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT); + if (layout == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->stages = pCreateInfo->shaderStages; + + /* From the Vulkan 1.3.XXX spec: + * + * VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093 + * + * "The number of tokens in the pTokens array with type equal to + * VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than + * or equal to 1" + * + * and + * + * VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139 + * + * "If the pTokens 
array contains a + * VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this + * token must be the first token in the array" + */ + if (pCreateInfo->tokenCount > 0 && + pCreateInfo->pTokens[0].type == + VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) { + const VkIndirectCommandsExecutionSetTokenEXT *token = + pCreateInfo->pTokens[0].data.pExecutionSet; + + /* Pipelines should never mismatch here. */ + if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT) + assert(token->shaderStages == pCreateInfo->shaderStages); + + layout->set_stages = token->shaderStages; + } + + result = build_process_shader(dev, pCreateInfo, pAllocator, + &layout->process, &layout->cmd_seq_stride_B, + &layout->qmd_size_per_seq_B); + if (result != VK_SUCCESS) { + nvk_indirect_commands_layout_destroy(dev, layout, pAllocator); + return result; + } + + if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) { + nvk_indirect_commands_layout_destroy(dev, layout, pAllocator); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Too many tokens in IndirectCommandsLayout"); + } + + result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B, + pAllocator, &layout->init); + if (result != VK_SUCCESS) { + nvk_indirect_commands_layout_destroy(dev, layout, pAllocator); + return result; + } + + *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +nvk_DestroyIndirectCommandsLayoutEXT( + VkDevice _device, + VkIndirectCommandsLayoutEXT indirectCommandsLayout, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(nvk_device, dev, _device); + VK_FROM_HANDLE(nvk_indirect_commands_layout, layout, + indirectCommandsLayout); + + if (layout == NULL) + return; + + nvk_indirect_commands_layout_destroy(dev, layout, pAllocator); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_GetGeneratedCommandsMemoryRequirementsEXT( + VkDevice _device, + const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(nvk_device, dev, _device); + VK_FROM_HANDLE(nvk_indirect_commands_layout, layout, + pInfo->indirectCommandsLayout); + struct nvk_physical_device *pdev = nvk_device_physical(dev); + + uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount; + if (layout->qmd_size_per_seq_B > 0) { + size = align64(size, QMD_ALIGN); + size += QMD_ALLOC_SIZE; + size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount; + } + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { + .size = size, + .alignment = QMD_ALIGN, + .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count), + }; +} + +static void +nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd, + const VkGeneratedCommandsInfoEXT *info, + const struct nvk_cmd_state *state) +{ + VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet); + VK_FROM_HANDLE(nvk_indirect_commands_layout, layout, + info->indirectCommandsLayout); + + struct process_cmd_push push = { + .in_addr = info->indirectAddress, + .out_addr = info->preprocessAddress, + .count_addr = info->sequenceCountAddress, + .max_seq_count = info->maxSequenceCount, + }; + + uint64_t qmd_addr = 0; + if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) { + uint32_t global_size[3] = { 0, 0, 0 }; + VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr, + &push.root_addr); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + } + + if (layout->set_stages == 0) { + 
push.ies_addr = qmd_addr; + } else { + assert(layout->set_stages == layout->stages); + push.ies_addr = ies->mem->va->addr; + push.ies_stride = ies->stride_B; + } + + if (layout->qmd_size_per_seq_B > 0) { + assert(info->preprocessAddress % QMD_ALIGN == 0); + uint64_t qmd_offset = + layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount; + qmd_offset = align64(qmd_offset, QMD_ALIGN); + push.qmd_pool_addr = info->preprocessAddress + qmd_offset; + } + + if (layout->init != NULL) { + nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0); + } + + nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push), + DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1); +} + +static void +nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd, + const VkGeneratedCommandsInfoEXT *info) +{ + struct nvk_descriptor_state *desc = + nvk_get_descriptor_state_for_stages(cmd, info->shaderStages); + nvk_cmd_buffer_flush_push_descriptors(cmd, desc); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer, + const VkGeneratedCommandsInfoEXT *info, + VkCommandBuffer stateCommandBuffer) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer); + + nvk_cmd_flush_process_state(state_cmd, info); + nvk_cmd_process_cmds(cmd, info, &state_cmd->state); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoEXT *info) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_indirect_commands_layout, layout, + info->indirectCommandsLayout); + + if (!isPreprocessed) { + nvk_cmd_flush_process_state(cmd, info); + nvk_cmd_process_cmds(cmd, info, &cmd->state); + + struct nv_push *p = nvk_cmd_buffer_push(cmd, 5); + P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, { + .data = DATA_TRUE, + .constant = CONSTANT_TRUE, + .flush_data = FLUSH_DATA_TRUE, + }); + P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0); + __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0); + } + + if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) { + assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT); + nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors); + } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) { + assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS)); + + nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors); + nvk_cmd_flush_gfx_dynamic_state(cmd); + + if (layout->set_stages == 0) { + /* In this case, we're using the CPU-bound shaders */ + nvk_cmd_flush_gfx_shaders(cmd); + nvk_cmd_flush_gfx_cbufs(cmd); + } else { + /* From the Vulkan 1.3.XXX spec: + * + * "If indirectCommandsLayout was created with a token sequence + * that contained the + * VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and + * indirectExecutionSet was created using + * VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every + * executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT + * token must bind all the shader stages set in the + * VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to + * create indirectCommandsLayout" + * + * So we unbind anything not explicitly bound by the layout and trust + * the layout to bind the rest. 
+          */
+         assert(layout->set_stages == layout->stages);
+
+         uint8_t set_types = 0;
+         u_foreach_bit(s, layout->set_stages) {
+            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
+            uint32_t type = mesa_to_nv9097_shader_type(stage);
+            set_types |= BITFIELD_BIT(type);
+         }
+
+         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;
+
+         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
+         u_foreach_bit(type, unset_types) {
+            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
+               .enable = ENABLE_FALSE,
+               .type = type,
+            });
+         }
+      }
+   }
+
+   ASSERTED const uint64_t size =
+      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
+   assert(size <= info->preprocessSize);
+
+   uint64_t addr = info->preprocessAddress;
+   uint64_t seq_count = info->maxSequenceCount;
+
+   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that,
+    * if the kernel inserts a sync point between two of our pushes, it
+    * doesn't break a single command.
+    */
+   const uint32_t max_seq_per_push =
+      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;
+
+   while (seq_count > 0) {
+      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
+      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
+      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
+      addr += push_size_B;
+      seq_count -= push_seq;
+   }
+
+   if (layout->set_stages != 0) {
+      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
+         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
+      }
+   }
+}
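
Note on the generated-buffer layout (illustrative sketch, not part of the patch): nvk_GetGeneratedCommandsMemoryRequirementsEXT sizes the preprocess buffer as the per-sequence command streams followed, for compute layouts, by a QMD_ALIGN-aligned QMD pool whose first dword is the allocator seeded by build_init_shader(). The helper below mirrors that math under those assumptions; the function name and parameters are hypothetical, while align64(), QMD_ALIGN and QMD_ALLOC_SIZE are the ones used in the patch.

/* Illustrative sketch only: mirrors the preprocess-buffer layout computed by
 * nvk_GetGeneratedCommandsMemoryRequirementsEXT and consumed by
 * nvk_cmd_process_cmds.  Hypothetical helper, not part of the patch.
 */
static uint64_t
sketch_preprocess_buffer_size(uint32_t cmd_seq_stride_B,
                              uint32_t qmd_size_per_seq_B,
                              uint32_t max_seq_count,
                              uint64_t *qmd_pool_offset_out)
{
   /* Region 0: one fixed-size generated command stream per sequence */
   uint64_t size = (uint64_t)cmd_seq_stride_B * max_seq_count;

   *qmd_pool_offset_out = 0;
   if (qmd_size_per_seq_B > 0) {
      /* Region 1: the QMD pool, starting at the next QMD_ALIGN boundary.
       * Its first QMD_ALLOC_SIZE bytes hold the allocator dword that
       * build_init_shader() seeds with 1 so allocations never overwrite it.
       */
      size = align64(size, QMD_ALIGN);
      *qmd_pool_offset_out = size;
      size += QMD_ALLOC_SIZE;
      size += (uint64_t)qmd_size_per_seq_B * max_seq_count;
   }
   return size;
}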