From cbec12627bbb1fbe2700de79b5c7d871f5ed0145 Mon Sep 17 00:00:00 2001 From: Mel Henning Date: Wed, 11 Feb 2026 16:58:51 -0500 Subject: [PATCH] nvk: VK_KHR_copy_memory_indirect Reviewed-by: Mary Guillemard Part-of: --- docs/features.txt | 1 + docs/relnotes/new_features.txt | 1 + src/nouveau/headers/meson.build | 4 +- src/nouveau/vulkan/cl/nvk_copy_indirect.cl | 50 +++++++ src/nouveau/vulkan/cl/nvk_copy_indirect.h | 8 + src/nouveau/vulkan/meson.build | 8 +- src/nouveau/vulkan/nvk_cmd_buffer.c | 2 + src/nouveau/vulkan/nvk_cmd_buffer.h | 2 + src/nouveau/vulkan/nvk_cmd_indirect.c | 163 +++++++++++++++++++++ src/nouveau/vulkan/nvk_device.h | 1 + src/nouveau/vulkan/nvk_physical_device.c | 7 + 11 files changed, 243 insertions(+), 4 deletions(-) create mode 100644 src/nouveau/vulkan/cl/nvk_copy_indirect.cl create mode 100644 src/nouveau/vulkan/cl/nvk_copy_indirect.h diff --git a/docs/features.txt b/docs/features.txt index 5e8eeb79848..aa5ace23242 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -540,6 +540,7 @@ Khronos extensions that are not part of any Vulkan version: VK_KHR_calibrated_timestamps DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn) VK_KHR_compute_shader_derivatives DONE (anv, lvp, nvk, radv, tu, vn) VK_KHR_cooperative_matrix DONE (anv, nvk/Turing+, radv/gfx11+, vn) + VK_KHR_copy_memory_indirect DONE (nvk) VK_KHR_depth_clamp_zero_one DONE (anv, nvk, panvk, radv, tu, vn) VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv, tu, vn) VK_KHR_display DONE (anv, nvk, panvk, pvr, radv, tu, v3dv, vn) diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index 50dc41192c3..2cb47b4d9f4 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -14,3 +14,4 @@ VK_EXT_zero_initialize_device_memory on panvk GL_EXT_shader_image_load_store on panfrost VK_KHR_swapchain_mutable_format on panvk VK_EXT_astc_decode_mode on panvk +VK_KHR_copy_memory_indirect on nvk diff --git a/src/nouveau/headers/meson.build b/src/nouveau/headers/meson.build index d9717ac6ce3..06fbcc92970 100644 --- a/src/nouveau/headers/meson.build +++ b/src/nouveau/headers/meson.build @@ -107,7 +107,7 @@ nv_push_class_dump_h = custom_target( command : [prog_python, '@INPUT0@', '--classes', flat_nv_classes, '--out-h', '@OUTPUT0@'], ) -_libnvidia_headers = static_library( +libnvidia_headers = static_library( 'nvidia_headers_c', ['nv_push.c', cl_generated, nv_push_class_dump_h], include_directories : ['.', 'nvidia/classes', inc_include, inc_src], @@ -118,7 +118,7 @@ _libnvidia_headers = static_library( idep_nvidia_headers = declare_dependency( include_directories : include_directories('.', 'nvidia/classes'), sources : cl_generated, - link_with : _libnvidia_headers, + link_with : libnvidia_headers, ) executable( diff --git a/src/nouveau/vulkan/cl/nvk_copy_indirect.cl b/src/nouveau/vulkan/cl/nvk_copy_indirect.cl new file mode 100644 index 00000000000..00a18f9d40a --- /dev/null +++ b/src/nouveau/vulkan/cl/nvk_copy_indirect.cl @@ -0,0 +1,50 @@ +/* + * Copyright 2026 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "nvk_copy_indirect.h" + +#include "nv_push.h" +#include "nv_push_cl90b5.h" + +#include "compiler/libcl/libcl_vk.h" + +void +nvk_copy_indirect(const __global void* in, uintptr_t in_stride, + __global uint32_t* restrict out, uint32_t count) +{ + uint i = get_sub_group_local_id() + cl_group_id.x * 32; + if (i >= count) + return; + + in += i * in_stride; + VkCopyMemoryIndirectCommandKHR cmd = *((__global VkCopyMemoryIndirectCommandKHR*)in); + + uint32_t push_data[NVK_COPY_INDIRECT_CMD_WORDS]; + struct nv_push push; + struct nv_push *p = &push; + nv_push_init(p, push_data, ARRAY_SIZE(push_data), + BITFIELD_BIT(SUBC_NV90B5)); + + P_MTHD(p, NV90B5, OFFSET_IN_UPPER); + P_NV90B5_OFFSET_IN_UPPER(p, cmd.srcAddress >> 32); + P_NV90B5_OFFSET_IN_LOWER(p, cmd.srcAddress & 0xffffffff); + P_NV90B5_OFFSET_OUT_UPPER(p, cmd.dstAddress >> 32); + P_NV90B5_OFFSET_OUT_LOWER(p, cmd.dstAddress & 0xffffffff); + + P_MTHD(p, NV90B5, LINE_LENGTH_IN); + P_NV90B5_LINE_LENGTH_IN(p, cmd.size); + + P_IMMD_WORD(p, NV90B5, LAUNCH_DMA, { + .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED, + .multi_line_enable = MULTI_LINE_ENABLE_FALSE, + .flush_enable = FLUSH_ENABLE_TRUE, + .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH, + .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH, + .remap_enable = REMAP_ENABLE_TRUE, + }); + + assert(nv_push_dw_count(p) == NVK_COPY_INDIRECT_CMD_WORDS); + memcpy(out + i * NVK_COPY_INDIRECT_CMD_WORDS, push_data, sizeof(push_data)); +} diff --git a/src/nouveau/vulkan/cl/nvk_copy_indirect.h b/src/nouveau/vulkan/cl/nvk_copy_indirect.h new file mode 100644 index 00000000000..76caf49bfd5 --- /dev/null +++ b/src/nouveau/vulkan/cl/nvk_copy_indirect.h @@ -0,0 +1,8 @@ +/* + * Copyright 2026 Valve Corporation + * SPDX-License-Identifier: MIT + */ +#pragma once + +#define NVK_COPY_INDIRECT_CMD_WORDS 8 +#define NVK_COPY_INDIRECT_CMD_BYTES (NVK_COPY_INDIRECT_CMD_WORDS * sizeof(uint32_t)) diff --git a/src/nouveau/vulkan/meson.build b/src/nouveau/vulkan/meson.build index 9ecfa1308a6..9e91c135da8 100644 --- a/src/nouveau/vulkan/meson.build +++ b/src/nouveau/vulkan/meson.build @@ -79,6 +79,7 @@ nvk_files = files( nvkcl_files = files( 'cl/nvk_query.cl', + 'cl/nvk_copy_indirect.cl', ) nvk_entrypoints = custom_target( @@ -94,16 +95,19 @@ nvk_entrypoints = custom_target( ) nvkcl_spv = custom_target( - input : nvkcl_files, + input : [nvkcl_files, libnvidia_headers], output : 'nvkcl.spv', command : [ prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', nvkcl_files, '--', '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'), '-I' + join_paths(meson.current_source_dir(), '.'), '-I' + join_paths(meson.project_source_root(), 'src'), + '-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers'), + '-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers/nvidia/classes'), + '-I' + join_paths(meson.project_build_root(), 'src/nouveau/headers'), cl_args, ], - depfile : '@PLAINNAME@.d', + depfile : 'nvkcl.spv.d', ) nvkcl = custom_target( diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c index abcd38974c1..85e35ddc5ee 100644 --- a/src/nouveau/vulkan/nvk_cmd_buffer.c +++ b/src/nouveau/vulkan/nvk_cmd_buffer.c @@ -95,6 +95,7 @@ nvk_create_cmd_buffer(struct vk_command_pool *vk_pool, list_inithead(&cmd->owned_gart_mem); list_inithead(&cmd->owned_qmd); cmd->pushes = UTIL_DYNARRAY_INIT; + cmd->copy_memory_indirect_temps = UTIL_DYNARRAY_INIT; cmd->prev_subc = ffs(nvk_cmd_buffer_subchannel_mask(cmd)) - 1; @@ -126,6 +127,7 @@ nvk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, cmd->cond_render_mem = NULL; util_dynarray_clear(&cmd->pushes); + util_dynarray_clear(&cmd->copy_memory_indirect_temps); memset(&cmd->state, 0, sizeof(cmd->state)); } diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.h b/src/nouveau/vulkan/nvk_cmd_buffer.h index 95ba436e607..99d5bca8391 100644 --- a/src/nouveau/vulkan/nvk_cmd_buffer.h +++ b/src/nouveau/vulkan/nvk_cmd_buffer.h @@ -226,6 +226,8 @@ struct nvk_cmd_buffer { uint32_t upload_offset; struct nvk_cmd_mem *cond_render_mem; + /** Array of struct nvk_cmd_mem* */ + struct util_dynarray copy_memory_indirect_temps; struct nvk_cmd_mem *push_mem; uint32_t *push_mem_limit; diff --git a/src/nouveau/vulkan/nvk_cmd_indirect.c b/src/nouveau/vulkan/nvk_cmd_indirect.c index 140e1d2f238..15b9b56865a 100644 --- a/src/nouveau/vulkan/nvk_cmd_indirect.c +++ b/src/nouveau/vulkan/nvk_cmd_indirect.c @@ -13,6 +13,9 @@ #include "nir_builder.h" #include "vk_pipeline.h" +#include "nvkcl.h" +#include "cl/nvk_copy_indirect.h" + #include "clcb97.h" #include "nv_push.h" #include "nv_push_cl9097.h" @@ -1133,3 +1136,163 @@ nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, } } } + +struct nvk_copy_indirect_push { + uint64_t in; + uint64_t in_stride; + uint64_t out; + uint32_t count; +}; + +static nir_shader * +build_copy_indierct_shader(void) +{ + nir_builder build = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, + "nvk-meta-copy-indirect"); + nir_builder *b = &build; + + struct glsl_struct_field push_fields[] = { + { .type = glsl_uint64_t_type(), .name = "in", .offset = 0 }, + { .type = glsl_uint64_t_type(), .name = "in_stride", .offset = 8 }, + { .type = glsl_uint64_t_type(), .name = "out", .offset = 16 }, + { .type = glsl_uint_type(), .name = "count", .offset = 24 }, + }; + const struct glsl_type *push_iface_type = + glsl_interface_type(push_fields, ARRAY_SIZE(push_fields), + GLSL_INTERFACE_PACKING_STD140, + false /* row_major */, "push"); + nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const, + push_iface_type, "push"); + + b->shader->info.workgroup_size[0] = 32; + + nvk_copy_indirect(b, load_struct_var(b, push, 0), + load_struct_var(b, push, 1), + load_struct_var(b, push, 2), + load_struct_var(b, push, 3)); + + return build.shader; +} + +static struct nvk_shader * +atomic_set_or_destroy_shader(struct nvk_device *dev, + struct nvk_shader **shader_ptr, + struct nvk_shader *shader, + const VkAllocationCallbacks *alloc) +{ + struct nvk_shader *old_shader = p_atomic_cmpxchg(shader_ptr, NULL, shader); + if (old_shader == NULL) { + return shader; + } else { + vk_shader_destroy(&dev->vk, &shader->vk, alloc); + return old_shader; + } +} + +static VkResult +get_copy_indirect_shader(struct nvk_device *dev, + struct nvk_shader **shader_out) +{ + struct nvk_shader *shader = p_atomic_read(&dev->copy_indirect); + if (shader != NULL) { + *shader_out = shader; + return VK_SUCCESS; + } + + nir_shader *nir = build_copy_indierct_shader(); + VkResult result = nvk_compile_nir_shader(dev, nir, &dev->vk.alloc, &shader); + if (result != VK_SUCCESS) + return result; + + *shader_out = atomic_set_or_destroy_shader(dev, &dev->copy_indirect, + shader, &dev->vk.alloc); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdCopyMemoryIndirectKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryIndirectInfoKHR* info) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + const struct nvk_physical_device *pdev = nvk_device_physical(dev); + VkResult result; + + if (info->copyCount == 0) + return; + + struct nvk_shader *shader; + result = get_copy_indirect_shader(dev, &shader); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + const uint32_t cmds_per_buffer = + NVK_CMD_MEM_SIZE / NVK_COPY_INDIRECT_CMD_BYTES; + const uint32_t num_buffers = DIV_ROUND_UP(info->copyCount, cmds_per_buffer); + + if (!util_dynarray_ensure_cap(&cmd->copy_memory_indirect_temps, + sizeof(struct nvk_cmd_mem*) * num_buffers)) { + vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return; + } + + while (util_dynarray_num_elements(&cmd->copy_memory_indirect_temps, + struct nvk_cmd_mem*) < num_buffers) { + struct nvk_cmd_mem *temp_mem; + result = nvk_cmd_buffer_alloc_mem(cmd, false, &temp_mem); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + util_dynarray_append_typed(&cmd->copy_memory_indirect_temps, + struct nvk_cmd_mem*, temp_mem); + } + + for (int i = 0; i < info->copyCount; i += cmds_per_buffer) { + struct nvk_cmd_mem *temp_mem = + *util_dynarray_element(&cmd->copy_memory_indirect_temps, + struct nvk_cmd_mem*, i / cmds_per_buffer); + uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i); + + const struct nvk_copy_indirect_push push_constants = { + .in = info->copyAddressRange.address + i * info->copyAddressRange.stride, + .in_stride = info->copyAddressRange.stride, + .out = temp_mem->mem->va->addr, + .count = count + }; + + nvk_cmd_dispatch_shader(cmd, shader, + &push_constants, sizeof(push_constants), + DIV_ROUND_UP(count, 32), 1, 1); + } + + if (pdev->info.cls_eng3d >= HOPPER_A) { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 1); + P_IMMD_WORD(p, NVC86F, WFI, 0); + } else { + struct nv_push *p = nvk_cmd_buffer_push(cmd, 2); + P_IMMD_WORD(p, NVA0C0, WAIT_FOR_IDLE, 0); + __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0); + } + + for (int i = 0; i < info->copyCount; i += cmds_per_buffer) { + struct nvk_cmd_mem *temp_mem = + *util_dynarray_element(&cmd->copy_memory_indirect_temps, + struct nvk_cmd_mem*, i / cmds_per_buffer); + uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i); + + nvk_cmd_buffer_push_indirect(cmd, temp_mem->mem->va->addr, + NVK_COPY_INDIRECT_CMD_BYTES * count); + } +} + +void nvk_CmdCopyMemoryToImageIndirectKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryToImageIndirectInfoKHR* info) +{ + /* Feature unimplemented */ + assert(false); +} diff --git a/src/nouveau/vulkan/nvk_device.h b/src/nouveau/vulkan/nvk_device.h index bbef86faf5a..2f46b120911 100644 --- a/src/nouveau/vulkan/nvk_device.h +++ b/src/nouveau/vulkan/nvk_device.h @@ -55,6 +55,7 @@ struct nvk_device { struct vk_meta_device meta; struct nvk_shader *copy_queries; + struct nvk_shader *copy_indirect; }; VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c index bdb7013619b..24d05e36404 100644 --- a/src/nouveau/vulkan/nvk_physical_device.c +++ b/src/nouveau/vulkan/nvk_physical_device.c @@ -120,6 +120,7 @@ nvk_get_device_extensions(const struct nvk_instance *instance, .KHR_compute_shader_derivatives = info->cls_eng3d >= TURING_A, .KHR_cooperative_matrix = info->cls_eng3d >= TURING_A, .KHR_copy_commands2 = true, + .KHR_copy_memory_indirect = true, .KHR_create_renderpass2 = true, .KHR_dedicated_allocation = true, .KHR_depth_stencil_resolve = true, @@ -476,6 +477,9 @@ nvk_get_device_features(const struct nv_device_info *info, .hostImageCopy = info->cls_eng3d >= TURING_A, .pushDescriptor = true, + /* VK_KHR_copy_memory_indirect */ + .indirectMemoryCopy = true, + /* VK_KHR_cooperative_matrix */ /* TU11X can run coop matrix but the performances are abysal */ .cooperativeMatrix = info->cls_eng3d >= TURING_A && !is_tu11x, @@ -1063,6 +1067,9 @@ nvk_get_device_properties(const struct nvk_instance *instance, */ .pipelineBinaryCompressedData = false, + /* VK_KHR_copy_memory_indirect */ + .supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT, + /* VK_EXT_conservative_rasterization */ .primitiveOverestimationSize = info->cls_eng3d >= VOLTA_A ? 1.0f / 512.0f : 0.0, .maxExtraPrimitiveOverestimationSize = 0.75,