nvk: VK_KHR_copy_memory_indirect

Reviewed-by: Mary Guillemard <mary@mary.zone> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39869>
2026-06-09 07:58:16 +02:00 · 2026-02-11 16:58:51 -05:00 · 2026-02-11 16:58:51 -05:00 · cbec12627b
commit cbec12627b
parent 6824004a0b
11 changed files with 243 additions and 4 deletions
--- a/docs/features.txt
+++ b/docs/features.txt
@ -540,6 +540,7 @@ Khronos extensions that are not part of any Vulkan version:
  VK_KHR_calibrated_timestamps                          DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn)
  VK_KHR_compute_shader_derivatives                     DONE (anv, lvp, nvk, radv, tu, vn)
  VK_KHR_cooperative_matrix                             DONE (anv, nvk/Turing+, radv/gfx11+, vn)
+  VK_KHR_copy_memory_indirect                           DONE (nvk)
  VK_KHR_depth_clamp_zero_one                           DONE (anv, nvk, panvk, radv, tu, vn)
  VK_KHR_deferred_host_operations                       DONE (anv, hasvk, lvp, radv, tu, vn)
  VK_KHR_display                                        DONE (anv, nvk, panvk, pvr, radv, tu, v3dv, vn)
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -14,3 +14,4 @@ VK_EXT_zero_initialize_device_memory on panvk
 GL_EXT_shader_image_load_store on panfrost
 VK_KHR_swapchain_mutable_format on panvk
 VK_EXT_astc_decode_mode on panvk
+VK_KHR_copy_memory_indirect on nvk
--- a/src/nouveau/headers/meson.build
+++ b/src/nouveau/headers/meson.build
@ -107,7 +107,7 @@ nv_push_class_dump_h = custom_target(
  command : [prog_python, '@INPUT0@', '--classes', flat_nv_classes, '--out-h', '@OUTPUT0@'],
 )

-_libnvidia_headers = static_library(
+libnvidia_headers = static_library(
  'nvidia_headers_c',
  ['nv_push.c', cl_generated, nv_push_class_dump_h],
  include_directories : ['.', 'nvidia/classes', inc_include, inc_src],
@ -118,7 +118,7 @@ _libnvidia_headers = static_library(
 idep_nvidia_headers = declare_dependency(
  include_directories : include_directories('.', 'nvidia/classes'),
  sources : cl_generated,
-  link_with : _libnvidia_headers,
+  link_with : libnvidia_headers,
 )

 executable(
--- a/src/nouveau/vulkan/cl/nvk_copy_indirect.cl
+++ b/src/nouveau/vulkan/cl/nvk_copy_indirect.cl
@ -0,0 +1,50 @@
+/*
+ * Copyright 2026 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nvk_copy_indirect.h"
+
+#include "nv_push.h"
+#include "nv_push_cl90b5.h"
+
+#include "compiler/libcl/libcl_vk.h"
+
+void
+nvk_copy_indirect(const __global void* in, uintptr_t in_stride,
+                  __global uint32_t* restrict out, uint32_t count)
+{
+   uint i = get_sub_group_local_id() + cl_group_id.x * 32;
+   if (i >= count)
+      return;
+
+   in += i * in_stride;
+   VkCopyMemoryIndirectCommandKHR cmd = *((__global VkCopyMemoryIndirectCommandKHR*)in);
+
+   uint32_t push_data[NVK_COPY_INDIRECT_CMD_WORDS];
+   struct nv_push push;
+   struct nv_push *p = &push;
+   nv_push_init(p, push_data, ARRAY_SIZE(push_data),
+                BITFIELD_BIT(SUBC_NV90B5));
+
+   P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
+   P_NV90B5_OFFSET_IN_UPPER(p, cmd.srcAddress >> 32);
+   P_NV90B5_OFFSET_IN_LOWER(p, cmd.srcAddress & 0xffffffff);
+   P_NV90B5_OFFSET_OUT_UPPER(p, cmd.dstAddress >> 32);
+   P_NV90B5_OFFSET_OUT_LOWER(p, cmd.dstAddress & 0xffffffff);
+
+   P_MTHD(p, NV90B5, LINE_LENGTH_IN);
+   P_NV90B5_LINE_LENGTH_IN(p, cmd.size);
+
+   P_IMMD_WORD(p, NV90B5, LAUNCH_DMA, {
+      .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
+      .multi_line_enable = MULTI_LINE_ENABLE_FALSE,
+      .flush_enable = FLUSH_ENABLE_TRUE,
+      .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
+      .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
+      .remap_enable = REMAP_ENABLE_TRUE,
+   });
+
+   assert(nv_push_dw_count(p) == NVK_COPY_INDIRECT_CMD_WORDS);
+   memcpy(out + i * NVK_COPY_INDIRECT_CMD_WORDS, push_data, sizeof(push_data));
+}
--- a/src/nouveau/vulkan/cl/nvk_copy_indirect.h
+++ b/src/nouveau/vulkan/cl/nvk_copy_indirect.h
@ -0,0 +1,8 @@
+/*
+ * Copyright 2026 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#pragma once
+
+#define NVK_COPY_INDIRECT_CMD_WORDS 8
+#define NVK_COPY_INDIRECT_CMD_BYTES (NVK_COPY_INDIRECT_CMD_WORDS * sizeof(uint32_t))
--- a/src/nouveau/vulkan/meson.build
+++ b/src/nouveau/vulkan/meson.build
@ -79,6 +79,7 @@ nvk_files = files(

 nvkcl_files = files(
  'cl/nvk_query.cl',
+  'cl/nvk_copy_indirect.cl',
 )

 nvk_entrypoints = custom_target(
@ -94,16 +95,19 @@ nvk_entrypoints = custom_target(
 )

 nvkcl_spv = custom_target(
-  input : nvkcl_files,
+  input : [nvkcl_files, libnvidia_headers],
  output : 'nvkcl.spv',
  command : [
    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', nvkcl_files, '--',
    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
    '-I' + join_paths(meson.current_source_dir(), '.'),
    '-I' + join_paths(meson.project_source_root(), 'src'),
+    '-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers'),
+    '-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers/nvidia/classes'),
+    '-I' + join_paths(meson.project_build_root(), 'src/nouveau/headers'),
    cl_args,
    ],
-  depfile : '@PLAINNAME@.d',
+  depfile : 'nvkcl.spv.d',
 )

 nvkcl = custom_target(
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@ -95,6 +95,7 @@ nvk_create_cmd_buffer(struct vk_command_pool *vk_pool,
   list_inithead(&cmd->owned_gart_mem);
   list_inithead(&cmd->owned_qmd);
   cmd->pushes = UTIL_DYNARRAY_INIT;
+   cmd->copy_memory_indirect_temps = UTIL_DYNARRAY_INIT;

   cmd->prev_subc = ffs(nvk_cmd_buffer_subchannel_mask(cmd)) - 1;

@ -126,6 +127,7 @@ nvk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   cmd->cond_render_mem = NULL;

   util_dynarray_clear(&cmd->pushes);
+   util_dynarray_clear(&cmd->copy_memory_indirect_temps);

   memset(&cmd->state, 0, sizeof(cmd->state));
 }
--- a/src/nouveau/vulkan/nvk_cmd_buffer.h
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.h
@ -226,6 +226,8 @@ struct nvk_cmd_buffer {
   uint32_t upload_offset;

   struct nvk_cmd_mem *cond_render_mem;
+   /** Array of struct nvk_cmd_mem* */
+   struct util_dynarray copy_memory_indirect_temps;

   struct nvk_cmd_mem *push_mem;
   uint32_t *push_mem_limit;
--- a/src/nouveau/vulkan/nvk_cmd_indirect.c
+++ b/src/nouveau/vulkan/nvk_cmd_indirect.c
@ -13,6 +13,9 @@
 #include "nir_builder.h"
 #include "vk_pipeline.h"

+#include "nvkcl.h"
+#include "cl/nvk_copy_indirect.h"
+
 #include "clcb97.h"
 #include "nv_push.h"
 #include "nv_push_cl9097.h"
@ -1133,3 +1136,163 @@ nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
      }
   }
 }
+
+struct nvk_copy_indirect_push {
+   uint64_t in;
+   uint64_t in_stride;
+   uint64_t out;
+   uint32_t count;
+};
+
+static nir_shader *
+build_copy_indierct_shader(void)
+{
+   nir_builder build =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
+                                     "nvk-meta-copy-indirect");
+   nir_builder *b = &build;
+
+   struct glsl_struct_field push_fields[] = {
+      { .type = glsl_uint64_t_type(), .name = "in", .offset = 0 },
+      { .type = glsl_uint64_t_type(), .name = "in_stride", .offset = 8 },
+      { .type = glsl_uint64_t_type(), .name = "out", .offset = 16 },
+      { .type = glsl_uint_type(), .name = "count", .offset = 24 },
+   };
+   const struct glsl_type *push_iface_type =
+      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
+                          GLSL_INTERFACE_PACKING_STD140,
+                          false /* row_major */, "push");
+   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
+                                            push_iface_type, "push");
+
+   b->shader->info.workgroup_size[0] = 32;
+
+   nvk_copy_indirect(b, load_struct_var(b, push, 0),
+                     load_struct_var(b, push, 1),
+                     load_struct_var(b, push, 2),
+                     load_struct_var(b, push, 3));
+
+   return build.shader;
+}
+
+static struct nvk_shader *
+atomic_set_or_destroy_shader(struct nvk_device *dev,
+                             struct nvk_shader **shader_ptr,
+                             struct nvk_shader *shader,
+                             const VkAllocationCallbacks *alloc)
+{
+   struct nvk_shader *old_shader = p_atomic_cmpxchg(shader_ptr, NULL, shader);
+   if (old_shader == NULL) {
+      return shader;
+   } else {
+      vk_shader_destroy(&dev->vk, &shader->vk, alloc);
+      return old_shader;
+   }
+}
+
+static VkResult
+get_copy_indirect_shader(struct nvk_device *dev,
+                        struct nvk_shader **shader_out)
+{
+   struct nvk_shader *shader = p_atomic_read(&dev->copy_indirect);
+   if (shader != NULL) {
+      *shader_out = shader;
+      return VK_SUCCESS;
+   }
+
+   nir_shader *nir = build_copy_indierct_shader();
+   VkResult result = nvk_compile_nir_shader(dev, nir, &dev->vk.alloc, &shader);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *shader_out = atomic_set_or_destroy_shader(dev, &dev->copy_indirect,
+                                              shader, &dev->vk.alloc);
+
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdCopyMemoryIndirectKHR(VkCommandBuffer commandBuffer,
+                             const VkCopyMemoryIndirectInfoKHR* info)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
+   VkResult result;
+
+   if (info->copyCount == 0)
+      return;
+
+   struct nvk_shader *shader;
+   result = get_copy_indirect_shader(dev, &shader);
+   if (result != VK_SUCCESS) {
+      vk_command_buffer_set_error(&cmd->vk, result);
+      return;
+   }
+
+   const uint32_t cmds_per_buffer =
+      NVK_CMD_MEM_SIZE / NVK_COPY_INDIRECT_CMD_BYTES;
+   const uint32_t num_buffers = DIV_ROUND_UP(info->copyCount, cmds_per_buffer);
+
+   if (!util_dynarray_ensure_cap(&cmd->copy_memory_indirect_temps,
+                                 sizeof(struct nvk_cmd_mem*) * num_buffers)) {
+      vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return;
+   }
+
+   while (util_dynarray_num_elements(&cmd->copy_memory_indirect_temps,
+                                     struct nvk_cmd_mem*) < num_buffers) {
+      struct nvk_cmd_mem *temp_mem;
+      result = nvk_cmd_buffer_alloc_mem(cmd, false, &temp_mem);
+      if (result != VK_SUCCESS) {
+         vk_command_buffer_set_error(&cmd->vk, result);
+         return;
+      }
+      util_dynarray_append_typed(&cmd->copy_memory_indirect_temps,
+                                 struct nvk_cmd_mem*, temp_mem);
+   }
+
+   for (int i = 0; i < info->copyCount; i += cmds_per_buffer) {
+      struct nvk_cmd_mem *temp_mem =
+         *util_dynarray_element(&cmd->copy_memory_indirect_temps,
+                                struct nvk_cmd_mem*, i / cmds_per_buffer);
+      uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i);
+
+      const struct nvk_copy_indirect_push push_constants = {
+         .in = info->copyAddressRange.address + i * info->copyAddressRange.stride,
+         .in_stride = info->copyAddressRange.stride,
+         .out = temp_mem->mem->va->addr,
+         .count = count
+      };
+
+      nvk_cmd_dispatch_shader(cmd, shader,
+                              &push_constants, sizeof(push_constants),
+                              DIV_ROUND_UP(count, 32), 1, 1);
+   }
+
+   if (pdev->info.cls_eng3d >= HOPPER_A) {
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 1);
+      P_IMMD_WORD(p, NVC86F, WFI, 0);
+   } else {
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
+      P_IMMD_WORD(p, NVA0C0, WAIT_FOR_IDLE, 0);
+      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
+   }
+
+   for (int i = 0; i < info->copyCount; i += cmds_per_buffer) {
+      struct nvk_cmd_mem *temp_mem =
+         *util_dynarray_element(&cmd->copy_memory_indirect_temps,
+                                struct nvk_cmd_mem*, i / cmds_per_buffer);
+      uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i);
+
+      nvk_cmd_buffer_push_indirect(cmd, temp_mem->mem->va->addr,
+                                   NVK_COPY_INDIRECT_CMD_BYTES * count);
+   }
+}
+
+void nvk_CmdCopyMemoryToImageIndirectKHR(VkCommandBuffer commandBuffer,
+      const VkCopyMemoryToImageIndirectInfoKHR* info)
+{
+   /* Feature unimplemented */
+   assert(false);
+}
--- a/src/nouveau/vulkan/nvk_device.h
+++ b/src/nouveau/vulkan/nvk_device.h
@ -55,6 +55,7 @@ struct nvk_device {
   struct vk_meta_device meta;

   struct nvk_shader *copy_queries;
+   struct nvk_shader *copy_indirect;
 };

 VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
--- a/src/nouveau/vulkan/nvk_physical_device.c
+++ b/src/nouveau/vulkan/nvk_physical_device.c
@ -120,6 +120,7 @@ nvk_get_device_extensions(const struct nvk_instance *instance,
      .KHR_compute_shader_derivatives = info->cls_eng3d >= TURING_A,
      .KHR_cooperative_matrix = info->cls_eng3d >= TURING_A,
      .KHR_copy_commands2 = true,
+      .KHR_copy_memory_indirect = true,
      .KHR_create_renderpass2 = true,
      .KHR_dedicated_allocation = true,
      .KHR_depth_stencil_resolve = true,
@ -476,6 +477,9 @@ nvk_get_device_features(const struct nv_device_info *info,
      .hostImageCopy = info->cls_eng3d >= TURING_A,
      .pushDescriptor = true,

+      /* VK_KHR_copy_memory_indirect */
+      .indirectMemoryCopy = true,
+
      /* VK_KHR_cooperative_matrix */
      /* TU11X can run coop matrix but the performances are abysal */
      .cooperativeMatrix = info->cls_eng3d >= TURING_A && !is_tu11x,
@ -1063,6 +1067,9 @@ nvk_get_device_properties(const struct nvk_instance *instance,
       */
      .pipelineBinaryCompressedData = false,

+      /* VK_KHR_copy_memory_indirect */
+      .supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT,
+
      /* VK_EXT_conservative_rasterization */
      .primitiveOverestimationSize = info->cls_eng3d >= VOLTA_A ? 1.0f / 512.0f : 0.0,
      .maxExtraPrimitiveOverestimationSize = 0.75,