nvk: VK_KHR_copy_memory_indirect

Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39869>
This commit is contained in:
Mel Henning 2026-02-11 16:58:51 -05:00 committed by Marge Bot
parent 6824004a0b
commit cbec12627b
11 changed files with 243 additions and 4 deletions

View file

@ -540,6 +540,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_calibrated_timestamps DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn)
VK_KHR_compute_shader_derivatives DONE (anv, lvp, nvk, radv, tu, vn)
VK_KHR_cooperative_matrix DONE (anv, nvk/Turing+, radv/gfx11+, vn)
VK_KHR_copy_memory_indirect DONE (nvk)
VK_KHR_depth_clamp_zero_one DONE (anv, nvk, panvk, radv, tu, vn)
VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv, tu, vn)
VK_KHR_display DONE (anv, nvk, panvk, pvr, radv, tu, v3dv, vn)

View file

@ -14,3 +14,4 @@ VK_EXT_zero_initialize_device_memory on panvk
GL_EXT_shader_image_load_store on panfrost
VK_KHR_swapchain_mutable_format on panvk
VK_EXT_astc_decode_mode on panvk
VK_KHR_copy_memory_indirect on nvk

View file

@ -107,7 +107,7 @@ nv_push_class_dump_h = custom_target(
command : [prog_python, '@INPUT0@', '--classes', flat_nv_classes, '--out-h', '@OUTPUT0@'],
)
_libnvidia_headers = static_library(
libnvidia_headers = static_library(
'nvidia_headers_c',
['nv_push.c', cl_generated, nv_push_class_dump_h],
include_directories : ['.', 'nvidia/classes', inc_include, inc_src],
@ -118,7 +118,7 @@ _libnvidia_headers = static_library(
idep_nvidia_headers = declare_dependency(
include_directories : include_directories('.', 'nvidia/classes'),
sources : cl_generated,
link_with : _libnvidia_headers,
link_with : libnvidia_headers,
)
executable(

View file

@ -0,0 +1,50 @@
/*
* Copyright 2026 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "nvk_copy_indirect.h"
#include "nv_push.h"
#include "nv_push_cl90b5.h"
#include "compiler/libcl/libcl_vk.h"
void
nvk_copy_indirect(const __global void* in, uintptr_t in_stride,
__global uint32_t* restrict out, uint32_t count)
{
uint i = get_sub_group_local_id() + cl_group_id.x * 32;
if (i >= count)
return;
in += i * in_stride;
VkCopyMemoryIndirectCommandKHR cmd = *((__global VkCopyMemoryIndirectCommandKHR*)in);
uint32_t push_data[NVK_COPY_INDIRECT_CMD_WORDS];
struct nv_push push;
struct nv_push *p = &push;
nv_push_init(p, push_data, ARRAY_SIZE(push_data),
BITFIELD_BIT(SUBC_NV90B5));
P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
P_NV90B5_OFFSET_IN_UPPER(p, cmd.srcAddress >> 32);
P_NV90B5_OFFSET_IN_LOWER(p, cmd.srcAddress & 0xffffffff);
P_NV90B5_OFFSET_OUT_UPPER(p, cmd.dstAddress >> 32);
P_NV90B5_OFFSET_OUT_LOWER(p, cmd.dstAddress & 0xffffffff);
P_MTHD(p, NV90B5, LINE_LENGTH_IN);
P_NV90B5_LINE_LENGTH_IN(p, cmd.size);
P_IMMD_WORD(p, NV90B5, LAUNCH_DMA, {
.data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
.multi_line_enable = MULTI_LINE_ENABLE_FALSE,
.flush_enable = FLUSH_ENABLE_TRUE,
.src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
.dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
.remap_enable = REMAP_ENABLE_TRUE,
});
assert(nv_push_dw_count(p) == NVK_COPY_INDIRECT_CMD_WORDS);
memcpy(out + i * NVK_COPY_INDIRECT_CMD_WORDS, push_data, sizeof(push_data));
}

View file

@ -0,0 +1,8 @@
/*
* Copyright 2026 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#pragma once
#define NVK_COPY_INDIRECT_CMD_WORDS 8
#define NVK_COPY_INDIRECT_CMD_BYTES (NVK_COPY_INDIRECT_CMD_WORDS * sizeof(uint32_t))

View file

@ -79,6 +79,7 @@ nvk_files = files(
nvkcl_files = files(
'cl/nvk_query.cl',
'cl/nvk_copy_indirect.cl',
)
nvk_entrypoints = custom_target(
@ -94,16 +95,19 @@ nvk_entrypoints = custom_target(
)
nvkcl_spv = custom_target(
input : nvkcl_files,
input : [nvkcl_files, libnvidia_headers],
output : 'nvkcl.spv',
command : [
prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', nvkcl_files, '--',
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.project_source_root(), 'src'),
'-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers'),
'-I' + join_paths(meson.project_source_root(), 'src/nouveau/headers/nvidia/classes'),
'-I' + join_paths(meson.project_build_root(), 'src/nouveau/headers'),
cl_args,
],
depfile : '@PLAINNAME@.d',
depfile : 'nvkcl.spv.d',
)
nvkcl = custom_target(

View file

@ -95,6 +95,7 @@ nvk_create_cmd_buffer(struct vk_command_pool *vk_pool,
list_inithead(&cmd->owned_gart_mem);
list_inithead(&cmd->owned_qmd);
cmd->pushes = UTIL_DYNARRAY_INIT;
cmd->copy_memory_indirect_temps = UTIL_DYNARRAY_INIT;
cmd->prev_subc = ffs(nvk_cmd_buffer_subchannel_mask(cmd)) - 1;
@ -126,6 +127,7 @@ nvk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
cmd->cond_render_mem = NULL;
util_dynarray_clear(&cmd->pushes);
util_dynarray_clear(&cmd->copy_memory_indirect_temps);
memset(&cmd->state, 0, sizeof(cmd->state));
}

View file

@ -226,6 +226,8 @@ struct nvk_cmd_buffer {
uint32_t upload_offset;
struct nvk_cmd_mem *cond_render_mem;
/** Array of struct nvk_cmd_mem* */
struct util_dynarray copy_memory_indirect_temps;
struct nvk_cmd_mem *push_mem;
uint32_t *push_mem_limit;

View file

@ -13,6 +13,9 @@
#include "nir_builder.h"
#include "vk_pipeline.h"
#include "nvkcl.h"
#include "cl/nvk_copy_indirect.h"
#include "clcb97.h"
#include "nv_push.h"
#include "nv_push_cl9097.h"
@ -1133,3 +1136,163 @@ nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
}
}
}
struct nvk_copy_indirect_push {
uint64_t in;
uint64_t in_stride;
uint64_t out;
uint32_t count;
};
static nir_shader *
build_copy_indierct_shader(void)
{
nir_builder build =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
"nvk-meta-copy-indirect");
nir_builder *b = &build;
struct glsl_struct_field push_fields[] = {
{ .type = glsl_uint64_t_type(), .name = "in", .offset = 0 },
{ .type = glsl_uint64_t_type(), .name = "in_stride", .offset = 8 },
{ .type = glsl_uint64_t_type(), .name = "out", .offset = 16 },
{ .type = glsl_uint_type(), .name = "count", .offset = 24 },
};
const struct glsl_type *push_iface_type =
glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
GLSL_INTERFACE_PACKING_STD140,
false /* row_major */, "push");
nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
push_iface_type, "push");
b->shader->info.workgroup_size[0] = 32;
nvk_copy_indirect(b, load_struct_var(b, push, 0),
load_struct_var(b, push, 1),
load_struct_var(b, push, 2),
load_struct_var(b, push, 3));
return build.shader;
}
static struct nvk_shader *
atomic_set_or_destroy_shader(struct nvk_device *dev,
struct nvk_shader **shader_ptr,
struct nvk_shader *shader,
const VkAllocationCallbacks *alloc)
{
struct nvk_shader *old_shader = p_atomic_cmpxchg(shader_ptr, NULL, shader);
if (old_shader == NULL) {
return shader;
} else {
vk_shader_destroy(&dev->vk, &shader->vk, alloc);
return old_shader;
}
}
static VkResult
get_copy_indirect_shader(struct nvk_device *dev,
struct nvk_shader **shader_out)
{
struct nvk_shader *shader = p_atomic_read(&dev->copy_indirect);
if (shader != NULL) {
*shader_out = shader;
return VK_SUCCESS;
}
nir_shader *nir = build_copy_indierct_shader();
VkResult result = nvk_compile_nir_shader(dev, nir, &dev->vk.alloc, &shader);
if (result != VK_SUCCESS)
return result;
*shader_out = atomic_set_or_destroy_shader(dev, &dev->copy_indirect,
shader, &dev->vk.alloc);
return VK_SUCCESS;
}
VKAPI_ATTR void VKAPI_CALL
nvk_CmdCopyMemoryIndirectKHR(VkCommandBuffer commandBuffer,
const VkCopyMemoryIndirectInfoKHR* info)
{
VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
VkResult result;
if (info->copyCount == 0)
return;
struct nvk_shader *shader;
result = get_copy_indirect_shader(dev, &shader);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd->vk, result);
return;
}
const uint32_t cmds_per_buffer =
NVK_CMD_MEM_SIZE / NVK_COPY_INDIRECT_CMD_BYTES;
const uint32_t num_buffers = DIV_ROUND_UP(info->copyCount, cmds_per_buffer);
if (!util_dynarray_ensure_cap(&cmd->copy_memory_indirect_temps,
sizeof(struct nvk_cmd_mem*) * num_buffers)) {
vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
return;
}
while (util_dynarray_num_elements(&cmd->copy_memory_indirect_temps,
struct nvk_cmd_mem*) < num_buffers) {
struct nvk_cmd_mem *temp_mem;
result = nvk_cmd_buffer_alloc_mem(cmd, false, &temp_mem);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd->vk, result);
return;
}
util_dynarray_append_typed(&cmd->copy_memory_indirect_temps,
struct nvk_cmd_mem*, temp_mem);
}
for (int i = 0; i < info->copyCount; i += cmds_per_buffer) {
struct nvk_cmd_mem *temp_mem =
*util_dynarray_element(&cmd->copy_memory_indirect_temps,
struct nvk_cmd_mem*, i / cmds_per_buffer);
uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i);
const struct nvk_copy_indirect_push push_constants = {
.in = info->copyAddressRange.address + i * info->copyAddressRange.stride,
.in_stride = info->copyAddressRange.stride,
.out = temp_mem->mem->va->addr,
.count = count
};
nvk_cmd_dispatch_shader(cmd, shader,
&push_constants, sizeof(push_constants),
DIV_ROUND_UP(count, 32), 1, 1);
}
if (pdev->info.cls_eng3d >= HOPPER_A) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 1);
P_IMMD_WORD(p, NVC86F, WFI, 0);
} else {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD_WORD(p, NVA0C0, WAIT_FOR_IDLE, 0);
__push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
}
for (int i = 0; i < info->copyCount; i += cmds_per_buffer) {
struct nvk_cmd_mem *temp_mem =
*util_dynarray_element(&cmd->copy_memory_indirect_temps,
struct nvk_cmd_mem*, i / cmds_per_buffer);
uint32_t count = MIN2(cmds_per_buffer, info->copyCount - i);
nvk_cmd_buffer_push_indirect(cmd, temp_mem->mem->va->addr,
NVK_COPY_INDIRECT_CMD_BYTES * count);
}
}
void nvk_CmdCopyMemoryToImageIndirectKHR(VkCommandBuffer commandBuffer,
const VkCopyMemoryToImageIndirectInfoKHR* info)
{
/* Feature unimplemented */
assert(false);
}

View file

@ -55,6 +55,7 @@ struct nvk_device {
struct vk_meta_device meta;
struct nvk_shader *copy_queries;
struct nvk_shader *copy_indirect;
};
VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

View file

@ -120,6 +120,7 @@ nvk_get_device_extensions(const struct nvk_instance *instance,
.KHR_compute_shader_derivatives = info->cls_eng3d >= TURING_A,
.KHR_cooperative_matrix = info->cls_eng3d >= TURING_A,
.KHR_copy_commands2 = true,
.KHR_copy_memory_indirect = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_depth_stencil_resolve = true,
@ -476,6 +477,9 @@ nvk_get_device_features(const struct nv_device_info *info,
.hostImageCopy = info->cls_eng3d >= TURING_A,
.pushDescriptor = true,
/* VK_KHR_copy_memory_indirect */
.indirectMemoryCopy = true,
/* VK_KHR_cooperative_matrix */
/* TU11X can run coop matrix but the performances are abysal */
.cooperativeMatrix = info->cls_eng3d >= TURING_A && !is_tu11x,
@ -1063,6 +1067,9 @@ nvk_get_device_properties(const struct nvk_instance *instance,
*/
.pipelineBinaryCompressedData = false,
/* VK_KHR_copy_memory_indirect */
.supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT,
/* VK_EXT_conservative_rasterization */
.primitiveOverestimationSize = info->cls_eng3d >= VOLTA_A ? 1.0f / 512.0f : 0.0,
.maxExtraPrimitiveOverestimationSize = 0.75,