From c843b43ccd0e3ac00b19cc20db49260eadae5ba9 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Wed, 25 Feb 2026 11:24:40 -0800
Subject: [PATCH 1/8] intel/mi_builder: add mi_ixor()

Just like mi_ior(), but for xor. We're going to use it in one of the
next commits.

Reviewed-by: Lionel Landwerlin
Signed-off-by: Paulo Zanoni
---
 src/intel/common/mi_builder.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/intel/common/mi_builder.h b/src/intel/common/mi_builder.h
index 41701d71ce8..b5f4bee4ee4 100644
--- a/src/intel/common/mi_builder.h
+++ b/src/intel/common/mi_builder.h
@@ -1066,6 +1066,17 @@ mi_ior(struct mi_builder *b,
                         MI_ALU_STORE, MI_ALU_ACCU);
 }
 
+static inline struct mi_value
+mi_ixor(struct mi_builder *b,
+        struct mi_value src0, struct mi_value src1)
+{
+   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
+      return mi_imm(mi_value_to_u64(src0) ^ mi_value_to_u64(src1));
+
+   return mi_math_binop(b, MI_ALU_XOR, src0, src1,
+                        MI_ALU_STORE, MI_ALU_ACCU);
+}
+
 #if GFX_VERx10 >= 125
 static inline struct mi_value
 mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)

From fa38f821f43961c2bfdc3e9156229d3fd25a6ce4 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Wed, 25 Feb 2026 11:27:16 -0800
Subject: [PATCH 2/8] intel/mi_builder: add mi_umax2()

We're going to use this for indirect copies, as we need to iterate
through the indirect buffer checking the copy sizes, then pick the
maximum copy size in order to launch the indirect compute shader.

Signed-off-by: Paulo Zanoni
---
 src/intel/common/mi_builder.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/intel/common/mi_builder.h b/src/intel/common/mi_builder.h
index b5f4bee4ee4..527e649015f 100644
--- a/src/intel/common/mi_builder.h
+++ b/src/intel/common/mi_builder.h
@@ -1301,6 +1301,40 @@ mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
    }
 }
 
+/* Finds the maximum of the two specified unsigned numbers. */
+static inline struct mi_value
+mi_umax2(struct mi_builder *b, struct mi_value val1, struct mi_value val2)
+{
+   /* The idea of the algorithm here is that the value of 'mask' will be
+    * either 0 or ~0 depending on which number is bigger. Then we use AND
+    * operations to ensure the smaller value becomes zero and the bigger value
+    * is preserved, and finally OR both values to the destination (the bigger
+    * and zero).
+    *
+    * In other words:
+    *   mask = val1 < val2 ? 0xFFFFFFFF : 0x0;
+    *   biggest = (val1 & ~mask) | (val2 & mask);
+    */
+
+   /* If 'val1' is smaller, 'mask' is ~0, otherwise it's 0. */
+   struct mi_value mask = mi_ult(b, mi_value_ref(b, val1),
+                                 mi_value_ref(b, val2));
+   struct mi_value notmask = mi_ixor(b, mi_value_ref(b, mask),
+                                     mi_imm(UINT64_MAX));
+   /* If 'val1' is smaller, 'notmask' is 0, so we zero it, otherwise we
+    * preserve the value by ANDing it with ~0.
+    */
+   struct mi_value val1_or_zero = mi_iand(b, val1, notmask);
+   /* If 'val2' is smaller, 'mask' is 0, so we zero it, otherwise we preserve
+    * the value.
+    */
+   struct mi_value val2_or_zero = mi_iand(b, val2, mask);
+   /* The smaller value was zeroed, the other was preserved, so just OR
+    * them now.
+    */
+   return mi_ior(b, val1_or_zero, val2_or_zero);
+}
+
 #endif /* MI_MATH section */
 
 /* This assumes addresses of strictly more than 32bits (aka. Gfx8+).
  */

From 25fe0719c5a7b01287129d21323eeca7c2a08e78 Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Wed, 25 Mar 2026 22:11:38 +0200
Subject: [PATCH 3/8] mi_builder: mi_umax2 tests

v2 (From Paulo): add more magic numbers.

Reviewed-by: Paulo Zanoni
---
 src/intel/common/tests/mi_builder_test.cpp | 52 ++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/intel/common/tests/mi_builder_test.cpp b/src/intel/common/tests/mi_builder_test.cpp
index af0579c2a5b..a2d50ecf244 100644
--- a/src/intel/common/tests/mi_builder_test.cpp
+++ b/src/intel/common/tests/mi_builder_test.cpp
@@ -1252,6 +1252,58 @@ TEST_F(mi_builder_test, udiv32_imm)
    }
 }
 
+TEST_F(mi_builder_test, umax2_32)
+{
+   const uint32_t values[] = {
+      0x01234567,
+      0x42424242,
+      0xffffffff,
+      0xf0000000,
+      0x00000001,
+      0x00000000,
+      0x000f0000,
+      0x000f0001,
+   };
+   memcpy(input, values, sizeof(values));
+
+   uint32_t cmp_val = 0x000f0001;
+   for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
+      mi_store(&b, out_mem32(i * 4), mi_umax2(&b, in_mem32(i * 4), mi_imm(cmp_val)));
+
+   submit_batch();
+
+   for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
+      EXPECT_EQ(*(uint32_t *)(output + i * 4),
+                values[i] >= cmp_val ? values[i] : cmp_val);
+   }
+}
+
+TEST_F(mi_builder_test, umax2_64)
+{
+   const uint64_t values[] = {
+      0x0123456789abcdef,
+      0x4242424242424242,
+      0xffffffffffffffff,
+      0xf000000000000000,
+      0x0000000000000001,
+      0x0000000000000000,
+      0x00000000f0000000,
+      0x00000000f0000001,
+   };
+   memcpy(input, values, sizeof(values));
+
+   uint64_t cmp_val = 0x00000000f0000001;
+   for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
+      mi_store(&b, out_mem64(i * 8), mi_umax2(&b, in_mem64(i * 8), mi_imm(cmp_val)));
+
+   submit_batch();
+
+   for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
+      EXPECT_EQ(*(uint64_t *)(output + i * 8),
+                values[i] >= cmp_val ? values[i] : cmp_val);
+   }
+}
+
 TEST_F(mi_builder_test, store_if)
 {
    uint64_t u64 = 0xb453b411deadc0deull;

From ae1b5ca19824af91bdf337d1cbf059844d5bbbae Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Wed, 25 Feb 2026 11:18:17 -0800
Subject: [PATCH 4/8] intel/blorp: prepare for usage of mi_builder.h

In the next patch we will use mi_builder.h from blorp code, so this
commit lays the groundwork for that by adding the necessary definitions
that the header requires.
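
For context, this is roughly the contract mi_builder.h assumes from its
includer (a sketch using the names from this series; the real blorp
definitions are in the diff below):

    /* Already provided by blorp_genX_exec_brw.h for genxml: */
    #define __gen_address_type struct blorp_address
    #define __gen_user_data struct blorp_batch
    #define __gen_combine_address _blorp_combine_address

    /* New in this patch: how mi_builder gets command-stream space and
     * where it tracks the write-fencing status:
     */
    #define __gen_get_batch_dwords(b, d) blorp_emit_dwords((b), (d))
    #define __gen_get_write_fencing_status(b) blorp_get_write_fencing_status(b)

    /* Helpers mi_builder.h calls directly. Blorp never needs the address
     * of its own batch, so the second one can just be UNREACHABLE().
     */
    static inline struct blorp_address
    __gen_address_offset(struct blorp_address addr, uint64_t offset);

    static inline struct blorp_address
    __gen_get_batch_address(struct blorp_batch *batch, void *location);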
Signed-off-by: Paulo Zanoni
---
 src/gallium/drivers/iris/iris_blorp.c |  7 +++++++
 src/intel/blorp/blorp_genX_exec_brw.h | 19 +++++++++++++++++++
 src/intel/vulkan/genX_blorp_exec.c    |  7 +++++++
 3 files changed, 33 insertions(+)

diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c
index 13b257262dc..6c18e2b00b0 100644
--- a/src/gallium/drivers/iris/iris_blorp.c
+++ b/src/gallium/drivers/iris/iris_blorp.c
@@ -534,3 +534,10 @@ blorp_emit_post_draw(struct blorp_batch *blorp_batch, const struct blorp_params
    genX(maybe_emit_breakpoint)(batch, false);
    blorp_measure_end(blorp_batch, params);
 }
+
+static bool *
+blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
+{
+   struct iris_batch *batch = blorp_batch->driver_batch;
+   return &batch->write_fence_status;
+}
diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h
index ba50dd8efc3..f72f439d3f6 100644
--- a/src/intel/blorp/blorp_genX_exec_brw.h
+++ b/src/intel/blorp/blorp_genX_exec_brw.h
@@ -148,6 +148,9 @@ brw_blorp_get_urb_length(const struct brw_fs_prog_data *prog_data)
    return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
 }
 
+static bool *
+blorp_get_write_fencing_status(struct blorp_batch *batch);
+
 /***** BEGIN blorp_exec implementation ******/
 
 static uint64_t
@@ -164,9 +167,25 @@ _blorp_combine_address(struct blorp_batch *batch, void *location,
 #define __gen_address_type struct blorp_address
 #define __gen_user_data struct blorp_batch
 #define __gen_combine_address _blorp_combine_address
+#define __gen_get_write_fencing_status(b) blorp_get_write_fencing_status(b)
+#define __gen_get_batch_dwords(b, d) blorp_emit_dwords((b), (d))
+
+static inline struct blorp_address
+__gen_address_offset(struct blorp_address addr, uint64_t offset)
+{
+   addr.offset += offset;
+   return addr;
+}
+
+static inline struct blorp_address
+__gen_get_batch_address(struct blorp_batch *batch, void *location)
+{
+   UNREACHABLE("Not supported by blorp");
+}
 
 #include "genxml/genX_pack.h"
 #include "common/intel_genX_state_brw.h"
+#include "common/mi_builder.h"
 
 #define _blorp_cmd_length(cmd) cmd ## _length
 #define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index b0f59e6edad..376d7dc06b5 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -596,3 +596,10 @@ genX(blorp_init_dynamic_states)(struct blorp_context *context)
 {
    blorp_init_dynamic_states(context);
 }
+
+static bool *
+blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
+{
+   struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
+   return &cmd_buffer->batch.write_fence_status;
+}

From 772966d412c29614003a73e906c60abc1cfaac57 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Thu, 2 Apr 2026 11:36:41 -0700
Subject: [PATCH 5/8] libcl/vk: add aligned(4) to VkCopyMemoryIndirectCommandKHR

This structure, despite containing 8-byte members, can be 4-byte
aligned:

   "VUID-VkCopyMemoryIndirectInfoKHR-copyAddressRange-10942
    copyAddressRange.address must be 4 byte aligned"

So do it like we do with the other structures.
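
As a sanity check, here's a host-side sketch (not part of the patch) of
why this works: GCC and clang let an aligned attribute attached to a
typedef both raise and lower a type's alignment, unlike on the struct
definition itself, so a struct made of 64-bit members can legally live
at any 4-byte boundary:

    #include <stdint.h>

    typedef uint64_t VkDeviceAddress;
    typedef uint64_t VkDeviceSize;

    typedef struct VkCopyMemoryIndirectCommandKHR {
       VkDeviceAddress srcAddress;
       VkDeviceAddress dstAddress;
       VkDeviceSize size;
    } VkCopyMemoryIndirectCommandKHR __attribute__((aligned(4)));

    /* The compiler may no longer assume 8-byte alignment when
     * dereferencing a pointer to this type.
     */
    _Static_assert(_Alignof(VkCopyMemoryIndirectCommandKHR) == 4, "");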
Reviewed-by: Lionel Landwerlin
Signed-off-by: Paulo Zanoni
---
 src/compiler/libcl/libcl_vk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/libcl/libcl_vk.h b/src/compiler/libcl/libcl_vk.h
index 6bb96efe30e..e3e8cf62423 100644
--- a/src/compiler/libcl/libcl_vk.h
+++ b/src/compiler/libcl/libcl_vk.h
@@ -186,4 +186,4 @@ typedef struct VkCopyMemoryIndirectCommandKHR {
    VkDeviceAddress srcAddress;
    VkDeviceAddress dstAddress;
    VkDeviceSize size;
-} VkCopyMemoryIndirectCommandKHR;
+} VkCopyMemoryIndirectCommandKHR __attribute__((aligned(4)));

From 373eabcdbf1dce763623bdaf87f83d584ce2c105 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Thu, 2 Apr 2026 11:55:35 -0700
Subject: [PATCH 6/8] libcl/vk: add VkCopyMemoryToImageIndirectCommandKHR and
 its members

The members are all naturally aligned to 4, but other
naturally-aligned-to-4 structs in this file still have the attribute
declared (such as VkDispatchIndirectCommand), so I'm adding the
attributes to these as well.

Reviewed-by: Lionel Landwerlin
Signed-off-by: Paulo Zanoni
---
 src/compiler/libcl/libcl_vk.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/compiler/libcl/libcl_vk.h b/src/compiler/libcl/libcl_vk.h
index e3e8cf62423..5969d97d5ff 100644
--- a/src/compiler/libcl/libcl_vk.h
+++ b/src/compiler/libcl/libcl_vk.h
@@ -11,6 +11,7 @@ typedef uint32_t VkBool32;
 typedef uint64_t VkDeviceAddress;
 typedef uint64_t VkDeviceSize;
 typedef uint32_t VkFlags;
+typedef VkFlags VkImageAspectFlags;
 
 typedef enum VkQueryType {
    VK_QUERY_TYPE_OCCLUSION = 0,
@@ -126,6 +127,25 @@ typedef enum VkDepthBiasRepresentationEXT {
    VK_DEPTH_BIAS_REPRESENTATION_MAX_ENUM_EXT = 0x7FFFFFFF
 } VkDepthBiasRepresentationEXT;
 
+typedef struct VkOffset3D {
+   int32_t x;
+   int32_t y;
+   int32_t z;
+} VkOffset3D __attribute__((aligned(4)));
+
+typedef struct VkExtent3D {
+   uint32_t width;
+   uint32_t height;
+   uint32_t depth;
+} VkExtent3D __attribute__((aligned(4)));
+
+typedef struct VkImageSubresourceLayers {
+   VkImageAspectFlags aspectMask;
+   uint32_t mipLevel;
+   uint32_t baseArrayLayer;
+   uint32_t layerCount;
+} VkImageSubresourceLayers __attribute__((aligned(4)));
+
 typedef struct VkDispatchIndirectCommand {
    uint32_t x;
    uint32_t y;
@@ -187,3 +207,12 @@ typedef struct VkCopyMemoryIndirectCommandKHR {
    VkDeviceAddress dstAddress;
    VkDeviceSize size;
 } VkCopyMemoryIndirectCommandKHR __attribute__((aligned(4)));
+
+typedef struct VkCopyMemoryToImageIndirectCommandKHR {
+   VkDeviceAddress srcAddress;
+   uint32_t bufferRowLength;
+   uint32_t bufferImageHeight;
+   VkImageSubresourceLayers imageSubresource;
+   VkOffset3D imageOffset;
+   VkExtent3D imageExtent;
+} VkCopyMemoryToImageIndirectCommandKHR __attribute__((aligned(4)));

From 0bd9aa85eb695c01a5ad0cf3c4f0d8f1345f8196 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Thu, 18 Dec 2025 15:34:36 -0800
Subject: [PATCH 7/8] anv: implement VK_KHR_copy_memory_indirect

This implements the extension on the Graphics and Compute queues using
Blorp OpenCL compute shaders. Support for the Transfer queue will come
in a later patch.

We also don't support 24/48/96 bpp formats yet.
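
For reference, the application-side usage being implemented looks
roughly like this (a sketch; 'indirect_va' and 'copy_count' are made-up
names, and the sType constant is assumed from the KHR naming pattern):

    /* 'indirect_va' points at GPU-visible memory holding an array of
     * VkCopyMemoryIndirectCommandKHR. The copy sizes are only known to
     * the GPU at execution time, which is why the dispatch size has to
     * be computed with MI_MATH (patches 1 and 2).
     */
    VkCopyMemoryIndirectInfoKHR info = {
       .sType = VK_STRUCTURE_TYPE_COPY_MEMORY_INDIRECT_INFO_KHR,
       .copyCount = copy_count,
       .copyAddressRange = {
          .address = indirect_va,
          .stride = sizeof(VkCopyMemoryIndirectCommandKHR),
          .size = copy_count * sizeof(VkCopyMemoryIndirectCommandKHR),
       },
    };
    vkCmdCopyMemoryIndirectKHR(commandBuffer, &info);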
Signed-off-by: Paulo Zanoni
---
 docs/features.txt                      |   2 +-
 src/intel/blorp/blorp.h                |  18 ++
 src/intel/blorp/blorp_genX_exec_brw.h  | 175 ++++++++++++
 src/intel/blorp/blorp_indirect_copy.c  | 355 +++++++++++++++++++++++++
 src/intel/blorp/blorp_priv.h           |  73 ++++-
 src/intel/blorp/blorp_shaders.cl       | 277 +++++++++++++++++++
 src/intel/blorp/meson.build            |   1 +
 src/intel/vulkan/anv_blorp.c           | 132 +++++++++
 src/intel/vulkan/anv_formats.c         |  40 ++-
 src/intel/vulkan/anv_physical_device.c |  10 +
 src/intel/vulkan/genX_blorp_exec.c     |   2 +
 11 files changed, 1082 insertions(+), 3 deletions(-)
 create mode 100644 src/intel/blorp/blorp_indirect_copy.c

diff --git a/docs/features.txt b/docs/features.txt
index 29197179949..0634d7cbe28 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -542,7 +542,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_calibrated_timestamps                          DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn)
   VK_KHR_compute_shader_derivatives                     DONE (anv, lvp, nvk, radv, tu, vn)
   VK_KHR_cooperative_matrix                             DONE (anv, nvk/Turing+, radv/gfx11+, vn)
-  VK_KHR_copy_memory_indirect                           DONE (nvk, radv/gfx8+)
+  VK_KHR_copy_memory_indirect                           DONE (anv, nvk, radv/gfx8+)
   VK_KHR_deferred_host_operations                       DONE (anv, hasvk, lvp, radv, tu, vn)
   VK_KHR_depth_clamp_zero_one                           DONE (anv, nvk, panvk, radv, tu, vn)
   VK_KHR_device_address_commands                        DONE (radv)
diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h
index b07006777f5..31c1e14805b 100644
--- a/src/intel/blorp/blorp.h
+++ b/src/intel/blorp/blorp.h
@@ -41,6 +41,8 @@ typedef struct nir_shader nir_shader;
 enum blorp_op {
    BLORP_OP_BLIT,
    BLORP_OP_COPY,
+   BLORP_OP_COPY_INDIRECT,
+   BLORP_OP_COPY_IMAGE_INDIRECT,
    BLORP_OP_CCS_AMBIGUATE,
    BLORP_OP_CCS_COLOR_CLEAR,
    BLORP_OP_CCS_PARTIAL_RESOLVE,
@@ -299,6 +301,22 @@ blorp_buffer_copy(struct blorp_batch *batch,
                   struct blorp_address dst,
                   uint64_t size);
 
+void
+blorp_copy_memory_indirect(struct blorp_batch *batch,
+                           uint64_t indirect_buf_addr,
+                           uint32_t copy_count,
+                           uint64_t stride);
+
+void
+blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
+                                    const struct blorp_surf *img_blorp_surf,
+                                    uint64_t indirect_buf_addr,
+                                    uint64_t indirect_buf_stride,
+                                    uint32_t copy_idx,
+                                    uint32_t img_mip_level,
+                                    int layer_count,
+                                    int forced_layer_or_z);
+
 void
 blorp_fast_clear(struct blorp_batch *batch,
                  const struct blorp_surf *surf,
diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h
index f72f439d3f6..44f420af8fd 100644
--- a/src/intel/blorp/blorp_genX_exec_brw.h
+++ b/src/intel/blorp/blorp_genX_exec_brw.h
@@ -1380,6 +1380,10 @@ blorp_setup_binding_table(struct blorp_batch *batch,
    uint32_t surface_offsets[BLORP_NUM_BT_ENTRIES], bind_offset = 0;
    void *surface_maps[BLORP_NUM_BT_ENTRIES];
 
+   /* There's nothing to bind here. */
+   if (params->op == BLORP_OP_COPY_INDIRECT)
+      return 0;
+
    if (params->use_pre_baked_binding_table) {
       bind_offset = params->pre_baked_binding_table_offset;
    } else {
@@ -1784,6 +1788,170 @@ blorp_get_compute_push_const(struct blorp_batch *batch,
    *state_size = push_const_size;
 }
 
+static void
+blorp_indirect_buffer_get_dispatch_size(struct blorp_batch *batch,
+                                        const struct blorp_params *params,
+                                        struct mi_builder *b,
+                                        struct mi_value *size_x,
+                                        struct mi_value *size_y,
+                                        struct mi_value *size_z)
+{
+   struct blorp_context *blorp = batch->blorp;
+   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
+
+   uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
+   uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
+   uint32_t copy_count = params->wm_inputs.indirect.copy_count;
+
+   size_t size_offset = 16; /* offsetof(VkCopyMemoryIndirectCommandKHR, size) */
+
+   struct mi_value biggest_copy_size;
+   for (int c = 0; c < copy_count; c++) {
+      struct blorp_address copy_size_addr = {
+         .buffer = NULL,
+         .offset = indirect_buf_addr + c * stride + size_offset,
+         .reloc_flags = 0,
+         .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
+         .local_hint = false, /* We don't have a way to know this. */
+      };
+
+      struct mi_value this_copy_size = mi_mem32(copy_size_addr);
+      if (c == 0)
+         biggest_copy_size = this_copy_size;
+      else
+         biggest_copy_size = mi_umax2(b, this_copy_size, biggest_copy_size);
+   }
+
+   /* Each shader invocation writes a uint32_t. */
+   int divisor = cs_prog_data->local_size[0] * sizeof(uint32_t);
+   *size_x = mi_udiv32_imm(b, mi_iadd_imm(b, biggest_copy_size, divisor - 1),
+                           divisor);
+
+   assert(cs_prog_data->local_size[1] == 1);
+   assert(cs_prog_data->local_size[2] == 1);
+   *size_y = mi_imm(1);
+   *size_z = mi_imm(1);
+}
+
+static void
+blorp_indirect_buf2img_get_dispatch_size(struct blorp_batch *batch,
+                                         const struct blorp_params *params,
+                                         struct mi_builder *b,
+                                         struct mi_value *size_x,
+                                         struct mi_value *size_y,
+                                         struct mi_value *size_z)
+{
+   struct blorp_context *blorp = batch->blorp;
+   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
+
+   uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
+   uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
+   uint32_t copy_idx = params->wm_inputs.indirect.copy_idx;
+   bool is_forced_layer = params->wm_inputs.indirect.forced_layer_or_z != -1;
+   bool is_3d = params->wm_inputs.indirect.dimensions == 3;
+
+   /* These are all offsetof(VkCopyMemoryToImageIndirectCommandKHR, x). */
+   const size_t img_extent_x_offset = 44;
+   const size_t img_extent_y_offset = 48;
+   const size_t img_extent_z_offset = 52;
+
+   struct blorp_address x_extent_addr = {
+      .buffer = NULL,
+      .offset = indirect_buf_addr + copy_idx * stride + img_extent_x_offset,
+      .reloc_flags = 0,
+      .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
+      .local_hint = false, /* We don't have a way to know this. */
+   };
+   struct blorp_address y_extent_addr = {
+      .buffer = NULL,
+      .offset = indirect_buf_addr + copy_idx * stride + img_extent_y_offset,
+      .reloc_flags = 0,
+      .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
+      .local_hint = false, /* We don't have a way to know this. */
+   };
*/ + }; + struct blorp_address z_extent_addr = { + .buffer = NULL, + .offset = indirect_buf_addr + copy_idx * stride + img_extent_z_offset, + .reloc_flags = 0, + .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false), + .local_hint = false, /* We don't have a way to know this. */ + }; + + /* Notes on the 'params->num_layers' usage below: + * + * - Please see how we handle this in function + * blorp_copy_memory_to_image_indirect(). + * + * - If we're using forced layers (see + * params.wm_inputs.indirect.forced_layer_or_z), then we can just set + * size_z to 1 and don't even look at the indirect buffer: each shader + * call operates on a single layer. + * + * - The information on the number of layers for each copy is not truly + * indirect: it has to be passed to during command creation, so we + * have already processed it. See 'max_layer_count' in + * blorp_copy_memory_to_image_indirect(). We don't want to be looking + * at the indirect buffer if we don't need to. + * + * - For 3D images, the applications can use the Z axis as either an + * actual axis (offset.z + extent.z) or pretend the Z axis is layers + * (base_layer + layer_count), so would have to check both places. + * Fortunately, base_layer + layer_count was already checked (see + * above) and recorded in params->num_layers. If params->num_layers is + * bigger than 1, then we don't even bother looking at extent.z. + */ + struct mi_value x_extent = mi_mem32(x_extent_addr); + struct mi_value y_extent = mi_mem32(y_extent_addr); + struct mi_value z_extent; + if (is_forced_layer) { + z_extent = mi_imm(1); + } else { + if (is_3d && params->num_layers == 1) + z_extent = mi_mem32(z_extent_addr); + else + z_extent = mi_imm(params->num_layers); + } + + int divisor_x = cs_prog_data->local_size[0]; + int divisor_y = cs_prog_data->local_size[1]; + int divisor_z = cs_prog_data->local_size[2]; + + assert(divisor_x != 1 && divisor_y != 1 && divisor_z == 1); + *size_x = mi_udiv32_imm(b, mi_iadd_imm(b, x_extent, divisor_x - 1), + divisor_x); + *size_y = mi_udiv32_imm(b, mi_iadd_imm(b, y_extent, divisor_y - 1), + divisor_y); + *size_z = z_extent; +} + +static void +blorp_indirect_write_gpgpu_dispatch_regs(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct blorp_context *blorp = batch->blorp; + const struct intel_device_info *devinfo = blorp->compiler->brw->devinfo; + + struct mi_builder b; + mi_builder_init(&b, devinfo, batch); + mi_builder_set_mocs(&b, isl_mocs(blorp->isl_dev, 0, false)); + + struct mi_value size_x, size_y, size_z; + + if (params->op == BLORP_OP_COPY_INDIRECT) { + blorp_indirect_buffer_get_dispatch_size(batch, params, &b, &size_x, + &size_y, &size_z); + } else { + assert(params->op == BLORP_OP_COPY_IMAGE_INDIRECT); + + blorp_indirect_buf2img_get_dispatch_size(batch, params, &b, &size_x, + &size_y, &size_z); + } + + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMX_num)), size_x); + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMY_num)), size_y); + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), size_z); +} + static void blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) { @@ -1798,6 +1966,11 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) const struct intel_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + bool use_indirect = blorp_op_type_is_indirect(params->op); + + if (use_indirect) + blorp_indirect_write_gpgpu_dispatch_regs(batch, params); + uint32_t group_x0 = params->x0 / 
    uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
    uint32_t group_z0 = params->dst.z_offset;
@@ -1911,6 +2084,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
    assert(cs_prog_data->push.per_thread.regs == 0);
 
    blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
+      cw.IndirectParameterEnable = use_indirect;
       cw.body = body;
    }
 #else /* GFX_VERx10 >= 125 */
@@ -1980,6 +2154,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
    }
 
    blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.IndirectParameterEnable = use_indirect;
       ggw.SIMDSize = dispatch.simd_size / 16;
       ggw.ThreadDepthCounterMaximum = 0;
       ggw.ThreadHeightCounterMaximum = 0;
diff --git a/src/intel/blorp/blorp_indirect_copy.c b/src/intel/blorp/blorp_indirect_copy.c
new file mode 100644
index 00000000000..4d46d3c8bd4
--- /dev/null
+++ b/src/intel/blorp/blorp_indirect_copy.c
@@ -0,0 +1,355 @@
+/* Copyright © 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "blorp_nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+#include "blorp_priv.h"
+#include "dev/intel_debug.h"
+#include "dev/intel_device_info.h"
+
+#include "blorp_shaders.h"
+
+/* Refer to struct blorp_wm_inputs_indirect. */
+struct blorp_indirect_vars {
+   nir_variable *indirect_buf_addr;
+   nir_variable *stride;
+   nir_variable *copy_count;
+   nir_variable *copy_idx;
+   nir_variable *max_layer;
+   nir_variable *x_offset;
+   nir_variable *y_offset;
+};
+
+static enum isl_format
+get_format_for_copy(int format_bpb)
+{
+   switch (format_bpb) {
+   case 8: return ISL_FORMAT_R8_UINT;
+   case 16: return ISL_FORMAT_R16_UINT;
+   case 24: return ISL_FORMAT_R8G8B8_UINT;
+   case 32: return ISL_FORMAT_R32_UINT;
+   case 48: return ISL_FORMAT_R16G16B16_UINT;
+   case 64: return ISL_FORMAT_R32G32_UINT;
+   case 96: return ISL_FORMAT_R32G32B32_UINT;
+   case 128: return ISL_FORMAT_R32G32B32A32_UINT;
+   default:
+      mesa_loge("unexpected format bpb: %d", format_bpb);
+      assert(false);
+      return ISL_FORMAT_UNSUPPORTED;
+   }
+}
+
+static void
+blorp_indirect_vars_init(nir_builder *b, struct blorp_indirect_vars *v)
+{
+   v->copy_count =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_count,
+                             glsl_uint_type());
+   v->indirect_buf_addr =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_addr,
+                             glsl_uint64_t_type());
+   v->stride =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_stride,
+                             glsl_uint64_t_type());
+   v->copy_idx =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_idx,
+                             glsl_uint_type());
+   v->max_layer =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.max_layer,
+                             glsl_uint_type());
+   v->x_offset =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.x_offset,
+                             glsl_uint_type());
+   v->y_offset =
+      BLORP_CREATE_NIR_INPUT(b->shader, indirect.y_offset,
+                             glsl_uint_type());
+}
+
+static nir_shader *
+blorp_build_copy_mem_indirect_shader(struct blorp_batch *batch,
+                                     void *mem_ctx)
+{
+   struct blorp_context *blorp = batch->blorp;
+
+   mesa_shader_stage stage = MESA_SHADER_COMPUTE;
+   const nir_shader_compiler_options *nir_options =
+      blorp->compiler->nir_options(blorp, stage);
+   nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
+                                                  "copy mem indirect");
+   ralloc_steal(mem_ctx, b.shader);
+
+   b.shader->info.workgroup_size[0] = 32;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+
+   struct blorp_indirect_vars v;
+   blorp_indirect_vars_init(&b, &v);
+
+   /* The indirect buffer is an array containing 'copy_count'
+    * VkCopyMemoryIndirectCommandKHR structures, separated by 'stride' bytes.
+    */
+   nir_def *copy_count = nir_load_var(&b, v.copy_count);
+   nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
+   nir_def *stride = nir_load_var(&b, v.stride);
+   nir_def *global_id =
+      nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
+
+   blorp_copy_memory_indirect_shader(&b, indirect_buf_addr, copy_count,
+                                     stride, global_id);
+
+   return b.shader;
+}
+
+static nir_shader *
+blorp_build_copy_mem2img_indirect_shader(struct blorp_batch *batch,
+                                         void *mem_ctx,
+                                         struct blorp_indirect_copy_mem2img_key *key)
+{
+   struct blorp_context *blorp = batch->blorp;
+
+   mesa_shader_stage stage = MESA_SHADER_COMPUTE;
+   const nir_shader_compiler_options *nir_options =
+      blorp->compiler->nir_options(blorp, stage);
+   nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
+                                                  "copy mem2img indirect");
+   ralloc_steal(mem_ctx, b.shader);
+
+   b.shader->info.workgroup_size[0] = 4;
+   b.shader->info.workgroup_size[1] = 8;
+   b.shader->info.workgroup_size[2] = 1;
+
+   struct blorp_indirect_vars v;
+   blorp_indirect_vars_init(&b, &v);
+
+   /* The indirect buffer is an array containing 'copy_count'
+    * VkCopyMemoryToImageIndirectCommandKHR structures, separated by 'stride'
+    * bytes.
+    */
+   nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
+   nir_def *stride = nir_load_var(&b, v.stride);
+   nir_def *copy_idx = nir_load_var(&b, v.copy_idx);
+   nir_def *max_layer = nir_load_var(&b, v.max_layer);
+   nir_def *dest_coord_offsets_arr[2] = {
+      nir_load_var(&b, v.x_offset),
+      nir_load_var(&b, v.y_offset),
+   };
+   nir_def *dest_coord_offsets = nir_vec(&b, dest_coord_offsets_arr, 2);
+
+   nir_def *global_id = nir_load_global_invocation_id(&b, 32);
+
+   /* Shader keys (constants). */
+   nir_def *dimensions = nir_imm_int(&b, key->dimensions);
+   nir_def *forced_layer_or_z = nir_imm_int(&b, key->forced_layer_or_z);
+   nir_def *format_Bpb = nir_imm_intN_t(&b, key->format_Bpb, 16);
+   nir_def *format_block_size_arr[3] = {
+      nir_imm_int(&b, key->format_bw),
+      nir_imm_int(&b, key->format_bh),
+      nir_imm_int(&b, key->format_bd),
+   };
+   nir_def *format_block_size = nir_vec(&b, format_block_size_arr, 3);
+
+   /* Constants derived from the shader keys. */
+   nir_def *is_block_compressed =
+      nir_imm_bool(&b, key->format_bw > 1 || key->format_bh > 1 ||
+                       key->format_bd > 1);
+
+   blorp_copy_memory_to_image_indirect_shader(&b, indirect_buf_addr,
+                                              stride,
+                                              copy_idx,
+                                              max_layer,
+                                              dest_coord_offsets,
+                                              global_id,
+                                              dimensions,
+                                              forced_layer_or_z,
+                                              format_Bpb,
+                                              format_block_size,
+                                              is_block_compressed);
+
+   return b.shader;
+}
+
+static bool
+blorp_get_copy_mem_indirect_kernel_cs(struct blorp_batch *batch,
+                                      struct blorp_params *params)
+{
+   struct blorp_context *blorp = batch->blorp;
+   const char *key = "copy_mem_indirect_kernel_cs";
+   uint32_t key_size = strlen(key);
+
+   if (blorp->lookup_shader(batch, key, key_size, &params->cs_prog_kernel,
+                            &params->cs_prog_data))
+      return true;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   nir_shader *nir =
+      blorp_build_copy_mem_indirect_shader(batch, mem_ctx);
+
+   const struct blorp_program prog =
+      blorp_compile_cs(blorp, mem_ctx, nir, key, key_size);
+
+   bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
+                                      key, key_size,
+                                      prog.kernel, prog.kernel_size,
+                                      prog.prog_data, prog.prog_data_size,
+                                      &params->cs_prog_kernel,
+                                      &params->cs_prog_data);
+
+   ralloc_free(mem_ctx);
+   return result;
+}
+
+static bool
+blorp_get_copy_mem2img_indirect_kernel_cs(struct blorp_batch *batch,
+                                          struct blorp_params *params,
+                                          struct blorp_indirect_copy_mem2img_key *key)
+{
+   struct blorp_context *blorp = batch->blorp;
+
+   if (blorp->lookup_shader(batch, key, sizeof(*key), &params->cs_prog_kernel,
+                            &params->cs_prog_data))
+      return true;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   nir_shader *nir =
+      blorp_build_copy_mem2img_indirect_shader(batch, mem_ctx, key);
+
+   const struct blorp_program prog =
+      blorp_compile_cs(blorp, mem_ctx, nir, key, sizeof(*key));
+
+   bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
+                                      key, sizeof(*key),
+                                      prog.kernel, prog.kernel_size,
+                                      prog.prog_data, prog.prog_data_size,
+                                      &params->cs_prog_kernel,
+                                      &params->cs_prog_data);
+
+   ralloc_free(mem_ctx);
+   return result;
+}
+
+void
+blorp_copy_memory_indirect(struct blorp_batch *batch,
+                           uint64_t indirect_buf_addr,
+                           uint32_t copy_count,
+                           uint64_t stride)
+{
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   params.op = BLORP_OP_COPY_INDIRECT;
+   params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
+   params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
+
+   params.x0 = 0;
+   params.y0 = 0;
+   params.x1 = 1;
+   params.y1 = 1;
+
+   params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
+   params.wm_inputs.indirect.indirect_buf_stride = stride;
+   params.wm_inputs.indirect.copy_count = copy_count;
+
+   if (!blorp_get_copy_mem_indirect_kernel_cs(batch, &params)) {
+      mesa_loge("failed to get copy_memory_indirect CS kernel");
+      assert(false);
+      return;
+   }
+
+   batch->blorp->exec(batch, &params);
+}
+
+void
+blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
+                                    const struct blorp_surf *img_blorp_surf,
+                                    uint64_t indirect_buf_addr,
+                                    uint64_t indirect_buf_stride,
+                                    uint32_t copy_idx,
+                                    uint32_t img_mip_level,
+                                    int layer_count,
+                                    int forced_layer_or_z)
+{
+   enum isl_format original_format = img_blorp_surf->surf->format;
+   const struct isl_format_layout *fmtl =
+      isl_format_get_layout(original_format);
+   enum isl_format copy_format = get_format_for_copy(fmtl->bpb);
+   int dimensions = img_blorp_surf->surf->dim + 1;
+
+   struct blorp_indirect_copy_mem2img_key key = {
+      .dimensions = dimensions,
+      .forced_layer_or_z = forced_layer_or_z,
+      .format_Bpb = fmtl->bpb / 8,
+      .format_bw = fmtl->bw,
+      .format_bh = fmtl->bh,
+      .format_bd = fmtl->bd,
+   };
+
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   params.op = BLORP_OP_COPY_IMAGE_INDIRECT;
+   params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
+   params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
+
+   params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
+   params.wm_inputs.indirect.indirect_buf_stride = indirect_buf_stride;
+   params.wm_inputs.indirect.copy_idx = copy_idx;
+   params.wm_inputs.indirect.dimensions = dimensions;
+   params.wm_inputs.indirect.max_layer =
+      img_blorp_surf->surf->logical_level0_px.array_len - 1;
+   params.wm_inputs.indirect.forced_layer_or_z = forced_layer_or_z;
+
+   /* params.dst is our image. */
+   blorp_surface_info_init(batch, &params.dst, img_blorp_surf,
+                           img_mip_level,
+                           forced_layer_or_z == -1 ? 0 : forced_layer_or_z,
+                           copy_format,
+                           true /* is_dest */);
+
+   struct isl_extent3d mip_dimensions = {
+      .width = MAX2(params.dst.surf.logical_level0_px.w >> img_mip_level, 1),
+      .height = MAX2(params.dst.surf.logical_level0_px.h >> img_mip_level, 1),
+      .depth = MAX2(params.dst.surf.logical_level0_px.d >> img_mip_level, 1),
+   };
+
+   if (fmtl->bw > 1 || fmtl->bh > 1 || fmtl->bd > 1) {
+      blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev,
+                                         &params.dst, NULL, NULL, NULL, NULL);
+      params.wm_inputs.indirect.x_offset = params.dst.tile_x_sa;
+      params.wm_inputs.indirect.y_offset = params.dst.tile_y_sa;
+
+      mip_dimensions.width = params.dst.surf.logical_level0_px.w;
+      mip_dimensions.height = params.dst.surf.logical_level0_px.h;
+      mip_dimensions.depth = params.dst.surf.logical_level0_px.d;
+   }
+
+   /* These settings control the number of workgroups in the shader, see
+    * blorp_exec_compute(). We don't need to divide by the local sizes here;
+    * that is done later.
+    */
+   params.x0 = 0;
+   params.y0 = 0;
+   params.x1 = mip_dimensions.width;
+   params.y1 = mip_dimensions.height;
+   if (forced_layer_or_z == -1) {
+      /* We set this here so blorp_indirect_buf2img_get_dispatch_size() can
+       * read it while figuring out how many shader instances we'll need.
+       */
+      params.num_layers = layer_count;
+   } else {
+      params.num_layers = 1;
+   }
+
+   if (!blorp_get_copy_mem2img_indirect_kernel_cs(batch, &params, &key)) {
+      mesa_loge("failed to get copy_memory_to_image_indirect CS kernel");
+      assert(false);
+      return;
+   }
+
+   batch->blorp->exec(batch, &params);
+}
diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h
index cc77cd770bd..a295084f9b7 100644
--- a/src/intel/blorp/blorp_priv.h
+++ b/src/intel/blorp/blorp_priv.h
@@ -205,17 +205,55 @@ struct blorp_wm_inputs_clear {
    struct blorp_bounds_rect bounds_rect;
 };
 
+/* Parameters used in blorp_indirect_copy.c */
+struct blorp_wm_inputs_indirect {
+   /* The address of the indirect buffer containing the information about the
+    * indirect copy.
+    */
+   uint64_t indirect_buf_addr;
+
+   /* How far apart the information about each copy is inside the indirect
+    * buffer.
+    */
+   uint64_t indirect_buf_stride;
+
+   /* How many copies we have to do. */
+   uint32_t copy_count;
+
+   /* For memory to image copies, we do a single copy per shader. This
+    * represents the index of the copy to be done.
+    */
+   uint32_t copy_idx;
+
+   /* How many dimensions does our image have? 1, 2 or 3. */
+   uint32_t dimensions;
+
+   /* The maximum array layer of the image. */
+   uint32_t max_layer;
+
+   /* When compressed formats are used, we pretend they are a non-compressed
+    * format of the same bpb.
+    * Since we can't maintain the exact same layout of mipmap and layer
+    * offsets, we're forced to make adjustments to where X and Y actually
+    * start, and are also forced to copy only one layer (or Z axis position)
+    * per shader invocation.
+    */
+   uint32_t x_offset;
+   uint32_t y_offset;
+   int forced_layer_or_z;
+};
+
 struct blorp_wm_inputs
 {
    union {
       struct blorp_wm_inputs_blit blit;
       struct blorp_wm_inputs_clear clear;
+      struct blorp_wm_inputs_indirect indirect;
    };
 
    /* Note: Pad out to an integral number of registers when extending, but
    * make sure subgroup_id is the last 32-bit item.
    */
-   uint32_t pad[2];
+   uint32_t pad[1];
 
    uint32_t subgroup_id;
 };
@@ -257,6 +295,7 @@ enum blorp_shader_type {
    BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
    BLORP_SHADER_TYPE_LAYER_OFFSET_VS,
    BLORP_SHADER_TYPE_GFX4_SF,
+   BLORP_SHADER_TYPE_COPY_INDIRECT,
 };
 
 enum blorp_shader_pipeline {
@@ -467,6 +506,25 @@ struct blorp_blit_prog_key
    uint8_t local_y;
 };
 
+struct blorp_indirect_copy_mem2img_key {
+   /* How many dimensions does our image have? 1, 2 or 3. */
+   uint32_t dimensions;
+
+   /* When compressed formats are used, we pretend they are a non-compressed
+    * format of the same bpb. Since we can't maintain the exact same layout
+    * of mipmap and layer offsets, we're forced to make adjustments to where X
+    * and Y actually start, and are also forced to copy only one layer (or Z
+    * axis position) per shader invocation.
+    */
+   int forced_layer_or_z;
+
+   /* Info taken from isl_format_layout. */
+   uint16_t format_Bpb;
+   uint16_t format_bw;
+   uint16_t format_bh;
+   uint16_t format_bd;
+};
+
 /**
  * \name BLORP internals
  * \{
@@ -597,6 +655,19 @@ blorp_op_type_is_clear(enum blorp_op op)
    }
 }
 
+/* This means: params->wm_inputs.indirect should be used. */
+static inline bool
+blorp_op_type_is_indirect(enum blorp_op op)
+{
+   switch (op) {
+   case BLORP_OP_COPY_INDIRECT:
+   case BLORP_OP_COPY_IMAGE_INDIRECT:
+      return true;
+   default:
+      return false;
+   }
+}
+
 /* Asserts unless the surface is a buffer to image copy */
 #define blorp_assert_is_buffer(surf, view) \
    do { \
diff --git a/src/intel/blorp/blorp_shaders.cl b/src/intel/blorp/blorp_shaders.cl
index 08dd30a37f9..27fce179cb9 100644
--- a/src/intel/blorp/blorp_shaders.cl
+++ b/src/intel/blorp/blorp_shaders.cl
@@ -3,6 +3,7 @@
  */
 
 #include "compiler/libcl/libcl.h"
+#include "compiler/libcl/libcl_vk.h"
 #include "compiler/nir/nir_defines.h"
 #include "compiler/shader_enums.h"
 
@@ -15,3 +16,279 @@ blorp_check_in_bounds(uint4 bounds_rect, uint2 pos)
 
    return pos.x >= x0 && pos.x < x1 &&
          pos.y >= y0 && pos.y < y1;
 }
+
+void nir_image_store(uint handle, int4 coords, uint sample_index,
+                     uint4 colors, uint lod, uint image_dim,
+                     uint image_array, uint format, uint access,
+                     uint range_base, uint src_type);
+
+/* Used by vkCmdCopyMemoryIndirectKHR. */
+void
+blorp_copy_memory_indirect_shader(
+   global uint *indirect_buf,
+   uint copy_count,
+   ulong stride_bytes,
+   uint global_id)
+{
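+   /* Note: the dispatch for this shader is sized on the GPU with
+    * mi_umax2() for the largest cmd.size in the indirect buffer (see
+    * blorp_indirect_buffer_get_dispatch_size()), so every invocation
+    * walks all 'copy_count' commands and the bounds check below filters
+    * out the invocations a smaller copy doesn't need.
+    */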
+ */ + VkCopyMemoryIndirectCommandKHR cmd = + (*(global VkCopyMemoryIndirectCommandKHR*)&indirect_buf[idx]); + + uint copy_size_ints = cmd.size / 4; + + global int *src = (global int*)cmd.srcAddress; + global int *dst = (global int*)cmd.dstAddress; + + if (global_id < copy_size_ints) + dst[global_id] = src[global_id]; + } +} + +struct img_copy_params { + ulong src_address; + uint2 row_size_px; + uint base_layer; + uint layer_count; + uint3 offset; + uint3 extent; +}; + +void +read_img_params( + global uint *indirect_buf, + uint copy_idx, + ulong stride_bytes, + uint dimensions, + uint max_layer, + uint3 format_block_size, + bool is_block_compressed, + struct img_copy_params *p) +{ + uint idx = copy_idx * (stride_bytes / 4); + + VkCopyMemoryToImageIndirectCommandKHR cmd = + (*(global VkCopyMemoryToImageIndirectCommandKHR*)&indirect_buf[idx]); + + p->src_address = cmd.srcAddress; + p->row_size_px = (uint2)(cmd.bufferRowLength, cmd.bufferImageHeight); + + /* We don't use imageSubresource.aspectMask and imageSubresource.mipLevel, + * those are dealt with when the application calls + * vkCmdCopyMemoryToImageIndirectKHR(). + */ + + p->base_layer = cmd.imageSubresource.baseArrayLayer; + p->layer_count = cmd.imageSubresource.layerCount; + + p->offset = (uint3)(cmd.imageOffset.x, cmd.imageOffset.y, + cmd.imageOffset.z); + p->extent = (uint3)(cmd.imageExtent.width, cmd.imageExtent.height, + cmd.imageExtent.depth); + + if (p->row_size_px.x == 0) + p->row_size_px.x = p->extent.x; + if (p->row_size_px.y == 0) + p->row_size_px.y = p->extent.y; + + /* Our code deals with blocks, not pixels. */ + if (is_block_compressed) { + p->offset /= format_block_size; + p->extent = DIV_ROUND_UP(p->extent, format_block_size); + } + + /* Users can pass 3D images with the Z axis as an array layer. */ + if (dimensions == 3 && (p->base_layer != 0 || p->layer_count != 1)) { + p->offset.z = p->base_layer; + p->extent.z = p->layer_count; + p->base_layer = 0; + p->layer_count = 1; + } + + /* This handles VK_REMAINING_ARRAY_LAYERS and bugs. */ + if (p->base_layer + p->layer_count > max_layer + 1) + p->layer_count = max_layer - p->base_layer + 1; +} + +uint4 +get_pixel( + ulong src_address, + uint format_Bpb) +{ + switch (format_Bpb) { + case 1: { + global uchar *src_buf = (global uchar *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 2: { + global ushort *src_buf = (global ushort *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 4: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 8: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(vload2(0, src_buf), 0, 0); + } + case 16: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(vload4(0, src_buf)); + } + default: + /* TODO: support 3, 6, 12. */ + return (uint4)(0xFF, 0, 0, 0xFF); + } +} + +int4 +get_coords( + uint3 pos, + uint layer, + uint2 dest_coord_offsets, + int dimensions, + int forced_layer_or_z) +{ + int4 ret; + + pos.xy += dest_coord_offsets; + + switch (dimensions) { + case 1: + ret = (int4)(pos.x, layer, 0, 0); + break; + case 2: + ret = (int4)(pos.x, pos.y, layer, 0); + break; + case 3: + default: + ret = (int4)(pos.x, pos.y, pos.z, 0); + break; + } + + if (forced_layer_or_z != -1) { + ret.z = 0; + ret.w = 0; + } + + return ret; +} + +void +write_pixel( + const int4 coords, + const uint4 colors) +{ + /* We don't seem to need to set image_array to true if we set sampler_dim + * to 3D. 
+    * Setting mip_level does not do anything on Intel; we set mip levels
+    * through the bindings.
+    */
+   nir_image_store(0 /* The image handle. */,
+                   coords, /* See get_coords(). */
+                   0, /* Sample index for multi-sampling. */
+                   colors, /* The RGBA pixels, in src_type. */
+                   0 /* mip_level */,
+                   GLSL_SAMPLER_DIM_3D,
+                   false, /* image_array */
+                   0, /* format */
+                   ACCESS_NON_READABLE, /* access */
+                   0, /* range_base */
+                   nir_type_int32 /* src_type */);
+}
+
+/* Used by vkCmdCopyMemoryToImageIndirectKHR. */
+void
+blorp_copy_memory_to_image_indirect_shader(
+   /* Actual parameters. */
+   global uint *indirect_buf,
+   ulong stride_bytes,
+   uint copy_idx,
+   uint max_layer,
+   uint2 dest_coord_offsets,
+   uint3 global_id,
+
+   /* These are shader keys, they are NIR immediates. */
+   uint dimensions,
+   int forced_layer_or_z,
+   ushort format_Bpb,
+   uint3 format_block_size,
+   bool is_block_compressed)
+{
+   /* We have one invocation per texel of a given mip level. This means
+    * that for the pixels outside the copy area, we'll hit the 'return'
+    * below.
+    */
+   uint3 src_pos = global_id;
+   uint layer;
+
+   /* 'forced_layer_or_z' means that whatever slice or layer we're
+    * trying to work with is set as part of our binding (view) as depth
+    * 0 or layer 0, depending on the dimensionality. This can happen
+    * when we're trying to pretend a format is something that it's not
+    * (e.g., we're treating a block compressed 4x4 64bpp format as an
+    * r32g32 format).
+    */
+   if (forced_layer_or_z != -1) {
+      if (dimensions == 3) {
+         src_pos.z = forced_layer_or_z;
+         layer = 0;
+      } else {
+         src_pos.z = 0;
+         layer = forced_layer_or_z;
+      }
+   } else {
+      if (dimensions == 3) {
+         src_pos.z = global_id.z;
+         layer = 0;
+      } else {
+         src_pos.z = 0;
+         layer = global_id.z;
+      }
+   }
+
+   struct img_copy_params p;
+   read_img_params(indirect_buf, copy_idx, stride_bytes, dimensions,
+                   max_layer, format_block_size, is_block_compressed, &p);
+
+   if (any(src_pos >= p.extent))
+      return;
+
+   uint3 dst_pos = src_pos + p.offset;
+   if (forced_layer_or_z == -1 && dimensions != 3)
+      layer = global_id.z + p.base_layer;
+
+   if (layer > p.base_layer + p.layer_count - 1)
+      return;
+
+   uint2 row_size_blocks = p.row_size_px;
+   if (is_block_compressed) {
+      row_size_blocks = DIV_ROUND_UP(row_size_blocks,
+                                     format_block_size.xy);
+   }
+   uint row_length_bytes = row_size_blocks.x * format_Bpb;
+   uint row_height_bytes = row_size_blocks.y * row_length_bytes;
+
+   uint buf_z_offset_bytes = src_pos.z * row_height_bytes;
+   uint buf_y_offset_bytes = src_pos.y * row_length_bytes;
+   uint buf_x_offset_bytes = src_pos.x * format_Bpb;
+
+   uint layer_offset_bytes = (layer - p.base_layer) * row_height_bytes;
+   uint buf_offset_bytes = layer_offset_bytes +
+                           buf_z_offset_bytes +
+                           buf_y_offset_bytes +
+                           buf_x_offset_bytes;
+
+   uint4 colors = get_pixel(p.src_address + buf_offset_bytes,
+                            format_Bpb);
+
+   int4 coords = get_coords(dst_pos, layer, dest_coord_offsets, dimensions,
+                            forced_layer_or_z);
+
+   write_pixel(coords, colors);
+}
diff --git a/src/intel/blorp/meson.build b/src/intel/blorp/meson.build
index 1b82cc5ce1f..9ccbf59275b 100644
--- a/src/intel/blorp/meson.build
+++ b/src/intel/blorp/meson.build
@@ -37,6 +37,7 @@ files_libblorp = files(
   'blorp.h',
   'blorp_blit.c',
   'blorp_clear.c',
+  'blorp_indirect_copy.c',
   'blorp_nir_builder.h',
   'blorp_priv.h',
 )
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 3348d95e589..899c25e2df2 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -2811,3 +2811,135 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
    anv_blorp_batch_finish(&batch);
 }
+
+void
+anv_CmdCopyMemoryIndirectKHR(
+    VkCommandBuffer commandBuffer,
+    const VkCopyMemoryIndirectInfoKHR* pCopyMemoryIndirectInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (pCopyMemoryIndirectInfo->copyCount == 0)
+      return;
+
+   const uint64_t indirect_buf_addr =
+      pCopyMemoryIndirectInfo->copyAddressRange.address;
+   const uint32_t copy_count = pCopyMemoryIndirectInfo->copyCount;
+   const uint64_t stride = pCopyMemoryIndirectInfo->copyAddressRange.stride;
+
+   /* These are all restrictions by the spec. */
+   assert((pCopyMemoryIndirectInfo->srcCopyFlags &
+           VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
+   assert((pCopyMemoryIndirectInfo->dstCopyFlags &
+           VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
+   assert(pCopyMemoryIndirectInfo->copyAddressRange.size >=
+          pCopyMemoryIndirectInfo->copyCount *
+          pCopyMemoryIndirectInfo->copyAddressRange.stride);
+
+   assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+
+   enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
+
+   struct blorp_batch batch;
+   anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
+
+   blorp_copy_memory_indirect(&batch, indirect_buf_addr, copy_count, stride);
+
+   anv_blorp_batch_finish(&batch);
+}
+
+void
+anv_CmdCopyMemoryToImageIndirectKHR(
+    VkCommandBuffer commandBuffer,
+    const VkCopyMemoryToImageIndirectInfoKHR* pCopyMemoryToImageIndirectInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, anv_image,
+                   pCopyMemoryToImageIndirectInfo->dstImage);
+
+   if (pCopyMemoryToImageIndirectInfo->copyCount == 0)
+      return;
+
+   const uint64_t indirect_buf_addr =
+      pCopyMemoryToImageIndirectInfo->copyAddressRange.address;
+   const uint32_t copy_count = pCopyMemoryToImageIndirectInfo->copyCount;
+   const uint64_t stride =
+      pCopyMemoryToImageIndirectInfo->copyAddressRange.stride;
+   const VkImageLayout img_layout =
+      pCopyMemoryToImageIndirectInfo->dstImageLayout;
+
+   assert((pCopyMemoryToImageIndirectInfo->srcCopyFlags &
+           VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
+   assert(pCopyMemoryToImageIndirectInfo->copyAddressRange.size >=
+          pCopyMemoryToImageIndirectInfo->copyCount *
+          pCopyMemoryToImageIndirectInfo->copyAddressRange.stride);
+
+   assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+
+   enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
+   struct blorp_batch batch;
+   anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
+
+   for (int c = 0; c < copy_count; c++) {
+      const VkImageSubresourceLayers *img_subresource =
+         &pCopyMemoryToImageIndirectInfo->pImageSubresources[c];
+      VkImageAspectFlags aspect_mask = img_subresource->aspectMask;
+      uint32_t mip_level = img_subresource->mipLevel;
+      uint32_t base_layer = img_subresource->baseArrayLayer;
+      uint32_t layer_count = img_subresource->layerCount;
+
+      assert(mip_level != VK_REMAINING_MIP_LEVELS);
+      if (layer_count == VK_REMAINING_ARRAY_LAYERS)
+         layer_count = anv_image->vk.array_layers - base_layer;
+
+      const unsigned plane =
+         anv_image_aspect_to_plane(anv_image, aspect_mask);
+      struct isl_surf *img_isl_surf =
+         &anv_image->planes[plane].primary_surface.isl;
+      enum isl_format format = img_isl_surf->format;
+      bool format_is_compressed = isl_format_is_compressed(format);
+
+      struct blorp_surf img_blorp_surf;
+      get_blorp_surf_for_anv_image(cmd_buffer, anv_image,
+                                   aspect_mask, VK_IMAGE_USAGE_STORAGE_BIT,
+                                   img_layout,
+                                   anv_image->planes[plane].aux_usage,
+                                   format, false /* cross_aspect */,
+                                   &img_blorp_surf);
+
+      anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image,
+                                        img_subresource->aspectMask,
+                                        img_blorp_surf.aux_usage, mip_level,
+                                        base_layer, layer_count);
+
+      /* If the format is compressed, we will pretend the format is one where
+       * each pixel is the size of the compressed block, so image stores can
+       * work. Unfortunately, that translation only works if we do it for one
+       * mip level of a specific layer, as the complete layouts of, say, BC4
+       * and R32G32 formats are not compatible: so we loop through all layers
+       * here.
+       */
+      if (format_is_compressed) {
+         int min_l_or_z, top_l_or_z;
+         if (img_blorp_surf.surf->dim == ISL_SURF_DIM_3D) {
+            min_l_or_z = 0;
+            top_l_or_z = img_isl_surf->logical_level0_px.depth >> mip_level;
+         } else {
+            min_l_or_z = base_layer;
+            top_l_or_z = base_layer + layer_count;
+         }
+         for (int l_or_z = min_l_or_z; l_or_z < top_l_or_z; l_or_z++) {
+            blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
+                                                indirect_buf_addr,
+                                                stride, c, mip_level,
+                                                layer_count, l_or_z);
+         }
+      } else {
+         blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
+                                             indirect_buf_addr, stride, c,
+                                             mip_level, layer_count, -1);
+      }
+   }
+
+   anv_blorp_batch_finish(&batch);
+}
diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index 22f177adf2c..cbbebbf6f34 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -629,6 +629,39 @@ anv_get_format_aspect(const struct anv_physical_device *device,
    return anv_get_format_plane(device, vk_format, plane, tiling);
 }
 
+static bool
+anv_format_supports_indirect_copies(const struct anv_physical_device *pdevice,
+                                    const struct anv_format *anv_format)
+{
+   const struct isl_format_layout *fmtl =
+      isl_format_get_layout(anv_format->planes[0].isl_format);
+
+   /* CTS insists on it even when we say we don't support it. */
+   if (!pdevice->vk.supported_features.indirectMemoryToImageCopy)
+      return false;
+
+   /* TODO: implement support for this in the copy shader. */
+   if (!util_is_power_of_two_or_zero(fmtl->bpb))
+      return false;
+
+   /* TODO: we use compute for indirect copies, and compute cannot write HiZ,
+    * we could try to support that if we see that applications want it.
+    */
+   if (vk_format_is_depth_or_stencil(anv_format->vk_format))
+      return false;
+
+   /* Let's leave YCbCr and multi-planar formats out until we have proper
+    * tests to verify they work.
+    */
+   if (isl_format_is_yuv(anv_format->planes[0].isl_format))
+      return false;
+
+   if (anv_format->n_planes > 1)
+      return false;
+
+   return true;
+}
+
 // Format capabilities
 
 static bool
@@ -852,7 +885,8 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
    if (vk_tiling == VK_IMAGE_TILING_LINEAR &&
       isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
       return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
-             VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+             VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
+             VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
 
    flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
             VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
@@ -935,6 +969,10 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
             VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
    }
 
+   if ((flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT) &&
+       anv_format_supports_indirect_copies(physical_device, anv_format))
+      flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
+
    /* XXX: We handle 3-channel formats by switching them out for RGBX or
     * RGBA formats behind-the-scenes.
    * This works fine for textures because the upload process will fill in
    * the extra channel.
diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c
index 5b723093e86..06b4502e12a 100644
--- a/src/intel/vulkan/anv_physical_device.c
+++ b/src/intel/vulkan/anv_physical_device.c
@@ -152,6 +152,7 @@ get_device_extensions(const struct anv_physical_device *device,
       .KHR_cooperative_matrix = device->has_cooperative_matrix,
       .NV_cooperative_matrix2 = device->has_cooperative_matrix,
       .KHR_copy_commands2 = true,
+      .KHR_copy_memory_indirect = true,
       .KHR_create_renderpass2 = true,
       .KHR_dedicated_allocation = true,
       .KHR_deferred_host_operations = true,
@@ -1023,6 +1024,10 @@ get_features(const struct anv_physical_device *pdevice,
       /* VK_KHR_pipeline_binary */
       .pipelineBinaries = true,
 
+      /* VK_KHR_copy_memory_indirect */
+      .indirectMemoryCopy = true,
+      .indirectMemoryToImageCopy = pdevice->info.ver >= 12,
+
 #ifdef ANV_USE_WSI_PLATFORM
       /* VK_EXT_present_timing */
       .presentTiming = true,
@@ -2205,6 +2210,11 @@ get_properties(const struct anv_physical_device *pdevice,
        */
       props->shaderBinaryVersion = 0;
    }
+
+   /* VK_KHR_copy_memory_indirect */
+   {
+      props->supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+   }
 }
 
 /* This function restricts the maximum size of system memory heap. The
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 376d7dc06b5..a6a9362fff4 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -534,6 +534,8 @@ get_color_aux_op(const struct blorp_params *params)
    case BLORP_OP_SLOW_COLOR_CLEAR:
    case BLORP_OP_BLIT:
    case BLORP_OP_COPY:
+   case BLORP_OP_COPY_INDIRECT:
+   case BLORP_OP_COPY_IMAGE_INDIRECT:
       assert(params->fast_clear_op == ISL_AUX_OP_NONE);
       return ANV_COLOR_AUX_OP_CLASS_NONE;
    }

From 69b11f7b257ee7670a7adebd60073dfe39de929d Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Wed, 15 Apr 2026 15:54:53 -0700
Subject: [PATCH 8/8] anv: enable VK_KHR_copy_memory_indirect for ASTC formats

If you have vk_require_astc=true, this will allow the formats to work.
Signed-off-by: Paulo Zanoni
---
 src/intel/vulkan/anv_formats.c | 42 ++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index cbbebbf6f34..90c1fffabde 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -748,28 +748,41 @@ anv_color_format_supports_drm_modifier_tiling(const struct anv_physical_device *
 }
 
 static VkFormatFeatureFlags2
-anv_get_compressed_emulated_format_features(const struct anv_format *anv_format,
+anv_get_compressed_emulated_format_features(const struct anv_physical_device *pdevice,
+                                            const struct anv_format *anv_format,
                                             const VkImageTiling vk_tiling)
 {
+   VkFormatFeatureFlags2 flags = 0;
+
    assert(isl_format_is_compressed(anv_format->planes[0].isl_format));
 
    /* Require optimal tiling so that we can decompress on upload */
-   if (vk_tiling == VK_IMAGE_TILING_OPTIMAL) {
+   switch (vk_tiling) {
+   case VK_IMAGE_TILING_OPTIMAL:
       /* Required features for compressed formats */
-      return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
-             VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
-             VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
-             VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
-             VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
-             VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
-   } else if (vk_tiling == VK_IMAGE_TILING_LINEAR) {
+      flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+               VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+               VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+               VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+               VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
+               VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
+      break;
+
+   case VK_IMAGE_TILING_LINEAR:
       /* Images used for transfers */
-      return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
-             VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
-             VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
+      flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+               VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
+               VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
+      break;
+
+   default:
+      return 0;
    }
 
-   return 0;
+   if (anv_format_supports_indirect_copies(pdevice, anv_format))
+      flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
+
+   return flags;
 }
 
 static VkFormatFeatureFlags2
@@ -1163,7 +1176,8 @@ anv_get_image_format_features2(const struct anv_physical_device *physical_device
    }
 
    if (anv_is_compressed_format_emulated(physical_device, vk_format)) {
-      return anv_get_compressed_emulated_format_features(anv_format,
+      return anv_get_compressed_emulated_format_features(physical_device,
+                                                         anv_format,
                                                          vk_tiling);
    }