From 0bd9aa85eb695c01a5ad0cf3c4f0d8f1345f8196 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Thu, 18 Dec 2025 15:34:36 -0800 Subject: [PATCH] anv: implement VK_KHR_copy_memory_indirect This implements the extension on the Graphics and Compute queues using Blorp OpenCL compute shaders. Support for the Transfer queue will come in a later patch. We also don't support 24/48/96 bpp formats yet. Signed-off-by: Paulo Zanoni --- docs/features.txt | 2 +- src/intel/blorp/blorp.h | 18 ++ src/intel/blorp/blorp_genX_exec_brw.h | 175 ++++++++++++ src/intel/blorp/blorp_indirect_copy.c | 355 +++++++++++++++++++++++++ src/intel/blorp/blorp_priv.h | 73 ++++- src/intel/blorp/blorp_shaders.cl | 277 +++++++++++++++++++ src/intel/blorp/meson.build | 1 + src/intel/vulkan/anv_blorp.c | 132 +++++++++ src/intel/vulkan/anv_formats.c | 40 ++- src/intel/vulkan/anv_physical_device.c | 10 + src/intel/vulkan/genX_blorp_exec.c | 2 + 11 files changed, 1082 insertions(+), 3 deletions(-) create mode 100644 src/intel/blorp/blorp_indirect_copy.c diff --git a/docs/features.txt b/docs/features.txt index 29197179949..0634d7cbe28 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -542,7 +542,7 @@ Khronos extensions that are not part of any Vulkan version: VK_KHR_calibrated_timestamps DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn) VK_KHR_compute_shader_derivatives DONE (anv, lvp, nvk, radv, tu, vn) VK_KHR_cooperative_matrix DONE (anv, nvk/Turing+, radv/gfx11+, vn) - VK_KHR_copy_memory_indirect DONE (nvk, radv/gfx8+) + VK_KHR_copy_memory_indirect DONE (anv, nvk, radv/gfx8+) VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv, tu, vn) VK_KHR_depth_clamp_zero_one DONE (anv, nvk, panvk, radv, tu, vn) VK_KHR_device_address_commands DONE (radv) diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h index b07006777f5..31c1e14805b 100644 --- a/src/intel/blorp/blorp.h +++ b/src/intel/blorp/blorp.h @@ -41,6 +41,8 @@ typedef struct nir_shader nir_shader; enum blorp_op { 
BLORP_OP_BLIT, BLORP_OP_COPY, + BLORP_OP_COPY_INDIRECT, + BLORP_OP_COPY_IMAGE_INDIRECT, BLORP_OP_CCS_AMBIGUATE, BLORP_OP_CCS_COLOR_CLEAR, BLORP_OP_CCS_PARTIAL_RESOLVE, @@ -299,6 +301,22 @@ blorp_buffer_copy(struct blorp_batch *batch, struct blorp_address dst, uint64_t size); +void +blorp_copy_memory_indirect(struct blorp_batch *batch, + uint64_t indirect_buf_addr, + uint32_t copy_count, + uint64_t stride); + +void +blorp_copy_memory_to_image_indirect(struct blorp_batch *batch, + const struct blorp_surf *img_blorp_surf, + uint64_t indirect_buf_addr, + uint64_t indirect_buf_stride, + uint32_t first_copy_idx, + uint32_t img_mip_level, + int layer_count, + int forced_layer_or_z); + void blorp_fast_clear(struct blorp_batch *batch, const struct blorp_surf *surf, diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h index f72f439d3f6..44f420af8fd 100644 --- a/src/intel/blorp/blorp_genX_exec_brw.h +++ b/src/intel/blorp/blorp_genX_exec_brw.h @@ -1380,6 +1380,10 @@ blorp_setup_binding_table(struct blorp_batch *batch, uint32_t surface_offsets[BLORP_NUM_BT_ENTRIES], bind_offset = 0; void *surface_maps[BLORP_NUM_BT_ENTRIES]; + /* There's nothing to bind here. 
*/ + if (params->op == BLORP_OP_COPY_INDIRECT) + return 0; + if (params->use_pre_baked_binding_table) { bind_offset = params->pre_baked_binding_table_offset; } else { @@ -1784,6 +1788,170 @@ blorp_get_compute_push_const(struct blorp_batch *batch, *state_size = push_const_size; } +static void +blorp_indirect_buffer_get_dispatch_size(struct blorp_batch *batch, + const struct blorp_params *params, + struct mi_builder *b, + struct mi_value *size_x, + struct mi_value *size_y, + struct mi_value *size_z) +{ + struct blorp_context *blorp = batch->blorp; + const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data; + + uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr; + uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride; + uint32_t copy_count = params->wm_inputs.indirect.copy_count; + + size_t size_offset = 16; /* offsetof(VkCopyMemoryIndirectCommandKHR, size) */ + + struct mi_value biggest_copy_size; + for (int c = 0; c < copy_count; c++) { + struct blorp_address copy_size_addr = { + .buffer = NULL, + .offset = indirect_buf_addr + c * stride + size_offset, + .reloc_flags = 0, + .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false), + .local_hint = false, /* We don't have a way to know this. */ + }; + + struct mi_value this_copy_size = mi_mem32(copy_size_addr); + if (c == 0) + biggest_copy_size = this_copy_size; + else + biggest_copy_size = mi_umax2(b, this_copy_size, biggest_copy_size); + } + + /* Each shader invocation writes an uint32_t. 
*/ + int divisor = cs_prog_data->local_size[0] * sizeof(uint32_t); + *size_x = mi_udiv32_imm(b, mi_iadd_imm(b, biggest_copy_size, divisor - 1), + divisor); + + assert(cs_prog_data->local_size[1] == 1); + assert(cs_prog_data->local_size[2] == 1); + *size_y = mi_imm(1); + *size_z = mi_imm(1); +} + +static void +blorp_indirect_buf2img_get_dispatch_size(struct blorp_batch *batch, + const struct blorp_params *params, + struct mi_builder *b, + struct mi_value *size_x, + struct mi_value *size_y, + struct mi_value *size_z) +{ + struct blorp_context *blorp = batch->blorp; + const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data; + + uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr; + uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride; + uint32_t copy_idx = params->wm_inputs.indirect.copy_idx; + bool is_forced_layer = params->wm_inputs.indirect.forced_layer_or_z != -1; + bool is_3d = params->wm_inputs.indirect.dimensions == 3; + + /* These are all offsetof(VkCopyMemoryToImageIndirectCommandKHR, x). */ + const size_t img_extent_x_offset = 44; + const size_t img_extent_y_offset = 48; + const size_t img_extent_z_offset = 52; + + struct blorp_address x_extent_addr = { + .buffer = NULL, + .offset = indirect_buf_addr + copy_idx * stride + img_extent_x_offset, + .reloc_flags = 0, + .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false), + .local_hint = false, /* We don't have a way to know this. */ + }; + struct blorp_address y_extent_addr = { + .buffer = NULL, + .offset = indirect_buf_addr + copy_idx * stride + img_extent_y_offset, + .reloc_flags = 0, + .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false), + .local_hint = false, /* We don't have a way to know this. 
*/ + }; + struct blorp_address z_extent_addr = { + .buffer = NULL, + .offset = indirect_buf_addr + copy_idx * stride + img_extent_z_offset, + .reloc_flags = 0, + .mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false), + .local_hint = false, /* We don't have a way to know this. */ + }; + + /* Notes on the 'params->num_layers' usage below: + * + * - Please see how we handle this in function + * blorp_copy_memory_to_image_indirect(). + * + * - If we're using forced layers (see + * params.wm_inputs.indirect.forced_layer_or_z), then we can just set + * size_z to 1 and don't even look at the indirect buffer: each shader + * call operates on a single layer. + * + * - The information on the number of layers for each copy is not truly + * indirect: it has to be passed to during command creation, so we + * have already processed it. See 'max_layer_count' in + * blorp_copy_memory_to_image_indirect(). We don't want to be looking + * at the indirect buffer if we don't need to. + * + * - For 3D images, the applications can use the Z axis as either an + * actual axis (offset.z + extent.z) or pretend the Z axis is layers + * (base_layer + layer_count), so would have to check both places. + * Fortunately, base_layer + layer_count was already checked (see + * above) and recorded in params->num_layers. If params->num_layers is + * bigger than 1, then we don't even bother looking at extent.z. 
+ */ + struct mi_value x_extent = mi_mem32(x_extent_addr); + struct mi_value y_extent = mi_mem32(y_extent_addr); + struct mi_value z_extent; + if (is_forced_layer) { + z_extent = mi_imm(1); + } else { + if (is_3d && params->num_layers == 1) + z_extent = mi_mem32(z_extent_addr); + else + z_extent = mi_imm(params->num_layers); + } + + int divisor_x = cs_prog_data->local_size[0]; + int divisor_y = cs_prog_data->local_size[1]; + int divisor_z = cs_prog_data->local_size[2]; + + assert(divisor_x != 1 && divisor_y != 1 && divisor_z == 1); + *size_x = mi_udiv32_imm(b, mi_iadd_imm(b, x_extent, divisor_x - 1), + divisor_x); + *size_y = mi_udiv32_imm(b, mi_iadd_imm(b, y_extent, divisor_y - 1), + divisor_y); + *size_z = z_extent; +} + +static void +blorp_indirect_write_gpgpu_dispatch_regs(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct blorp_context *blorp = batch->blorp; + const struct intel_device_info *devinfo = blorp->compiler->brw->devinfo; + + struct mi_builder b; + mi_builder_init(&b, devinfo, batch); + mi_builder_set_mocs(&b, isl_mocs(blorp->isl_dev, 0, false)); + + struct mi_value size_x, size_y, size_z; + + if (params->op == BLORP_OP_COPY_INDIRECT) { + blorp_indirect_buffer_get_dispatch_size(batch, params, &b, &size_x, + &size_y, &size_z); + } else { + assert(params->op == BLORP_OP_COPY_IMAGE_INDIRECT); + + blorp_indirect_buf2img_get_dispatch_size(batch, params, &b, &size_x, + &size_y, &size_z); + } + + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMX_num)), size_x); + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMY_num)), size_y); + mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), size_z); +} + static void blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) { @@ -1798,6 +1966,11 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) const struct intel_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + bool use_indirect = 
blorp_op_type_is_indirect(params->op); + + if (use_indirect) + blorp_indirect_write_gpgpu_dispatch_regs(batch, params); + uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0]; uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1]; uint32_t group_z0 = params->dst.z_offset; @@ -1911,6 +2084,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) assert(cs_prog_data->push.per_thread.regs == 0); blorp_emit(batch, GENX(COMPUTE_WALKER), cw) { + cw.IndirectParameterEnable = use_indirect, cw.body = body; } #else /* GFX_VERx10 >= 125 */ @@ -1980,6 +2154,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) } blorp_emit(batch, GENX(GPGPU_WALKER), ggw) { + ggw.IndirectParameterEnable = use_indirect, ggw.SIMDSize = dispatch.simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; diff --git a/src/intel/blorp/blorp_indirect_copy.c b/src/intel/blorp/blorp_indirect_copy.c new file mode 100644 index 00000000000..4d46d3c8bd4 --- /dev/null +++ b/src/intel/blorp/blorp_indirect_copy.c @@ -0,0 +1,355 @@ +/* Copyright © 2025 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "blorp_nir_builder.h" +#include "compiler/nir/nir_format_convert.h" + +#include "blorp_priv.h" +#include "dev/intel_debug.h" +#include "dev/intel_device_info.h" + +#include "blorp_shaders.h" + +/* Refer to struct blorp_wm_inputs_indirect. 
*/ +struct blorp_indirect_vars { + nir_variable *indirect_buf_addr; + nir_variable *stride; + nir_variable *copy_count; + nir_variable *copy_idx; + nir_variable *max_layer; + nir_variable *x_offset; + nir_variable *y_offset; +}; + +static enum isl_format +get_format_for_copy(int format_bpb) +{ + switch (format_bpb) { + case 8: return ISL_FORMAT_R8_UINT; + case 16: return ISL_FORMAT_R16_UINT; + case 24: return ISL_FORMAT_R8G8B8_UINT; + case 32: return ISL_FORMAT_R32_UINT; + case 48: return ISL_FORMAT_R16G16B16_UINT; + case 64: return ISL_FORMAT_R32G32_UINT; + case 96: return ISL_FORMAT_R32G32B32_UINT; + case 128: return ISL_FORMAT_R32G32B32A32_UINT; + default: + mesa_loge("unexpected format bpb: %d", format_bpb); + assert(false); + return ISL_FORMAT_UNSUPPORTED; + } +} + +static void +blorp_indirect_vars_init(nir_builder *b, struct blorp_indirect_vars *v) +{ + v->copy_count = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_count, + glsl_uint_type()); + v->indirect_buf_addr = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_addr, + glsl_uint64_t_type()); + v->stride = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_stride, + glsl_uint64_t_type()); + v->copy_idx = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_idx, + glsl_uint_type()); + v->max_layer = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.max_layer, + glsl_uint_type()); + v->x_offset = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.x_offset, + glsl_uint_type()); + v->y_offset = + BLORP_CREATE_NIR_INPUT(b->shader, indirect.y_offset, + glsl_uint_type()); +} + +static nir_shader * +blorp_build_copy_mem_indirect_shader(struct blorp_batch *batch, + void *mem_ctx) +{ + + struct blorp_context *blorp = batch->blorp; + + mesa_shader_stage stage = MESA_SHADER_COMPUTE; + const nir_shader_compiler_options *nir_options = + blorp->compiler->nir_options(blorp, stage); + nir_builder b = nir_builder_init_simple_shader(stage, nir_options, + "copy mem indirect"); + ralloc_steal(mem_ctx, b.shader); + + 
b.shader->info.workgroup_size[0] = 32; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + + struct blorp_indirect_vars v; + blorp_indirect_vars_init(&b, &v); + + /* The indirect buffer is an array containing 'copy_count' + * VkCopyMemoryIndirectCommandKHR structures, separated by 'stride' bytes. + */ + nir_def *copy_count = nir_load_var(&b, v.copy_count); + nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr); + nir_def *stride = nir_load_var(&b, v.stride); + nir_def *global_id = + nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0); + + blorp_copy_memory_indirect_shader(&b, indirect_buf_addr, copy_count, + stride, global_id); + + return b.shader; +} + +static nir_shader * +blorp_build_copy_mem2img_indirect_shader(struct blorp_batch *batch, + void *mem_ctx, + struct blorp_indirect_copy_mem2img_key *key) +{ + struct blorp_context *blorp = batch->blorp; + + mesa_shader_stage stage = MESA_SHADER_COMPUTE; + const nir_shader_compiler_options *nir_options = + blorp->compiler->nir_options(blorp, stage); + nir_builder b = nir_builder_init_simple_shader(stage, nir_options, + "copy mem2img indirect"); + ralloc_steal(mem_ctx, b.shader); + + b.shader->info.workgroup_size[0] = 4; + b.shader->info.workgroup_size[1] = 8; + b.shader->info.workgroup_size[2] = 1; + + struct blorp_indirect_vars v; + blorp_indirect_vars_init(&b, &v); + + /* The indirect buffer is an array containing 'copy_count' + * VkCopyMemoryToImageIndirectCommandKHR structures, separated by 'stride' + * bytes. 
+ */ + nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr); + nir_def *stride = nir_load_var(&b, v.stride); + nir_def *copy_idx = nir_load_var(&b, v.copy_idx); + nir_def *max_layer = nir_load_var(&b, v.max_layer); + nir_def *dest_coord_offsets_arr[2] = { + nir_load_var(&b, v.x_offset), + nir_load_var(&b, v.y_offset), + }; + nir_def *dest_coord_offsets = nir_vec(&b, dest_coord_offsets_arr, 2); + + nir_def *global_id = nir_load_global_invocation_id(&b, 32); + + /* Shader keys (constants). */ + nir_def *dimensions = nir_imm_int(&b, key->dimensions); + nir_def *forced_layer_or_z = nir_imm_int(&b, key->forced_layer_or_z); + nir_def *format_Bpb = nir_imm_intN_t(&b, key->format_Bpb, 16); + nir_def *format_block_size_arr[3] = { + nir_imm_int(&b, key->format_bw), + nir_imm_int(&b, key->format_bh), + nir_imm_int(&b, key->format_bd), + }; + nir_def *format_block_size = nir_vec(&b, format_block_size_arr, 3); + + /* Constants derived from the shader keys. */ + nir_def *is_block_compressed = + nir_imm_bool(&b, key->format_bw > 1 || key->format_bh > 1 || + key->format_bd > 1); + + blorp_copy_memory_to_image_indirect_shader(&b, indirect_buf_addr, + stride, + copy_idx, + max_layer, + dest_coord_offsets, + global_id, + dimensions, + forced_layer_or_z, + format_Bpb, + format_block_size, + is_block_compressed); + + return b.shader; +} + +static bool +blorp_get_copy_mem_indirect_kernel_cs(struct blorp_batch *batch, + struct blorp_params *params) +{ + struct blorp_context *blorp = batch->blorp; + const char *key = "copy_mem_indirect_kernel_cs"; + uint32_t key_size = strlen(key); + + if (blorp->lookup_shader(batch, key, key_size, ¶ms->cs_prog_kernel, + ¶ms->cs_prog_data)) + return true; + + void *mem_ctx = ralloc_context(NULL); + + nir_shader *nir = + blorp_build_copy_mem_indirect_shader(batch, mem_ctx); + + const struct blorp_program prog = + blorp_compile_cs(blorp, mem_ctx, nir, key, key_size); + + bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE, + key, 
key_size, + prog.kernel, prog.kernel_size, + prog.prog_data, prog.prog_data_size, + ¶ms->cs_prog_kernel, + ¶ms->cs_prog_data); + + ralloc_free(mem_ctx); + return result; + +} + +static bool +blorp_get_copy_mem2img_indirect_kernel_cs(struct blorp_batch *batch, + struct blorp_params *params, + struct blorp_indirect_copy_mem2img_key *key) +{ + struct blorp_context *blorp = batch->blorp; + + if (blorp->lookup_shader(batch, key, sizeof(*key), ¶ms->cs_prog_kernel, + ¶ms->cs_prog_data)) + return true; + + void *mem_ctx = ralloc_context(NULL); + + nir_shader *nir = + blorp_build_copy_mem2img_indirect_shader(batch, mem_ctx, key); + + const struct blorp_program prog = + blorp_compile_cs(blorp, mem_ctx, nir, key, sizeof(*key)); + + bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE, + key, sizeof(*key), + prog.kernel, prog.kernel_size, + prog.prog_data, prog.prog_data_size, + ¶ms->cs_prog_kernel, + ¶ms->cs_prog_data); + + ralloc_free(mem_ctx); + return result; +} + +void +blorp_copy_memory_indirect(struct blorp_batch *batch, + uint64_t indirect_buf_addr, + uint32_t copy_count, + uint64_t stride) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + params.op = BLORP_OP_COPY_INDIRECT; + params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT; + params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE; + + params.x0 = 0; + params.y0 = 0; + params.x1 = 1; + params.y1 = 1; + + params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr; + params.wm_inputs.indirect.indirect_buf_stride = stride; + params.wm_inputs.indirect.copy_count = copy_count; + + if (!blorp_get_copy_mem_indirect_kernel_cs(batch, ¶ms)) { + mesa_loge("failed to get copy_memory_indirect CS kernel"); + assert(false); + return; + } + + batch->blorp->exec(batch, ¶ms); +} + +void +blorp_copy_memory_to_image_indirect(struct blorp_batch *batch, + const struct blorp_surf *img_blorp_surf, + uint64_t indirect_buf_addr, + uint64_t indirect_buf_stride, + uint32_t copy_idx, + uint32_t img_mip_level, + int 
layer_count, + int forced_layer_or_z) +{ + enum isl_format original_format = img_blorp_surf->surf->format; + const struct isl_format_layout *fmtl = + isl_format_get_layout(original_format); + enum isl_format copy_format = get_format_for_copy(fmtl->bpb); + int dimensions = img_blorp_surf->surf->dim + 1; + + struct blorp_indirect_copy_mem2img_key key = { + .dimensions = dimensions, + .forced_layer_or_z = forced_layer_or_z, + .format_Bpb = fmtl->bpb / 8, + .format_bw = fmtl->bw, + .format_bh = fmtl->bh, + .format_bd = fmtl->bd, + }; + + struct blorp_params params; + blorp_params_init(¶ms); + + params.op = BLORP_OP_COPY_IMAGE_INDIRECT; + params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT; + params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE; + + params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr; + params.wm_inputs.indirect.indirect_buf_stride = indirect_buf_stride; + params.wm_inputs.indirect.copy_idx = copy_idx; + params.wm_inputs.indirect.dimensions = dimensions; + params.wm_inputs.indirect.max_layer = + img_blorp_surf->surf->logical_level0_px.array_len - 1; + params.wm_inputs.indirect.forced_layer_or_z = forced_layer_or_z; + + /* params.dst is our image. */ + blorp_surface_info_init(batch, ¶ms.dst, img_blorp_surf, + img_mip_level, + forced_layer_or_z == -1 ? 
0 : forced_layer_or_z, + copy_format, + true /* is_dest */); + + struct isl_extent3d mip_dimensions = { + .width = MAX2(params.dst.surf.logical_level0_px.w >> img_mip_level, 1), + .height = MAX2(params.dst.surf.logical_level0_px.h >> img_mip_level, 1), + .depth = MAX2(params.dst.surf.logical_level0_px.d >> img_mip_level, 1), + }; + + if (fmtl->bw > 1 || fmtl->bh > 1 || fmtl->bd > 1) { + blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, + ¶ms.dst, NULL, NULL, NULL, NULL); + params.wm_inputs.indirect.x_offset = params.dst.tile_x_sa; + params.wm_inputs.indirect.y_offset = params.dst.tile_y_sa; + + mip_dimensions.width = params.dst.surf.logical_level0_px.w; + mip_dimensions.height = params.dst.surf.logical_level0_px.h; + mip_dimensions.depth = params.dst.surf.logical_level0_px.d; + } + + /* These settings control the number of workgroups in the shader, see + * blorp_exec_compute(). We don't need to divide by the local sizes here, + * this will be done later. + */ + params.x0 = 0; + params.y0 = 0; + params.x1 = mip_dimensions.width; + params.y1 = mip_dimensions.height; + if (forced_layer_or_z == -1) { + /* We set this here so blorp_indirect_buf2img_get_dispatch_size() can + * read it while figuring out how many shader instances we'll need. 
+ */ + params.num_layers = layer_count; + } else { + params.num_layers = 1; + } + + if (!blorp_get_copy_mem2img_indirect_kernel_cs(batch, ¶ms, &key)) { + mesa_loge("failed to get copy_memory_to_image_indirect CS kernel"); + assert(false); + return; + } + + batch->blorp->exec(batch, ¶ms); +} diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h index cc77cd770bd..a295084f9b7 100644 --- a/src/intel/blorp/blorp_priv.h +++ b/src/intel/blorp/blorp_priv.h @@ -205,17 +205,55 @@ struct blorp_wm_inputs_clear { struct blorp_bounds_rect bounds_rect; }; +/* Parameters using in blorp_indirect_copy.c */ +struct blorp_wm_inputs_indirect { + /* The address of the indirect buffer containing the information about the + * indirect copy. + */ + uint64_t indirect_buf_addr; + + /* How far apart the information about each copy is inside the indirect + * buffer. + */ + uint64_t indirect_buf_stride; + + /* How many copies we have to do. */ + uint32_t copy_count; + + /* For memory to image copies, we do a single copy per shader. This + * represents the index of the copy to be done. + */ + uint32_t copy_idx; + + /* How many dimensions does our image have? 1, 2 or 3. */ + uint32_t dimensions; + + /* The maximum array layer of the image. */ + uint32_t max_layer; + + /* When compressed formats are used, we pretend they are a non-compressed + * format, of the same bpb. Since we can't maintain the exact same layout + * of mipmap and layer offsets, we're forced to make adjustments to where X + * and Y actually start, and are also forced to copy only one layer (or Z + * axis position) per shader invocation. + */ + uint32_t x_offset; + uint32_t y_offset; + int forced_layer_or_z; +}; + struct blorp_wm_inputs { union { struct blorp_wm_inputs_blit blit; struct blorp_wm_inputs_clear clear; + struct blorp_wm_inputs_indirect indirect; }; /* Note: Pad out to an integral number of registers when extending, but * make sure subgroup_id is the last 32-bit item. 
*/ - uint32_t pad[2]; + uint32_t pad[1]; uint32_t subgroup_id; }; @@ -257,6 +295,7 @@ enum blorp_shader_type { BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE, BLORP_SHADER_TYPE_LAYER_OFFSET_VS, BLORP_SHADER_TYPE_GFX4_SF, + BLORP_SHADER_TYPE_COPY_INDIRECT, }; enum blorp_shader_pipeline { @@ -467,6 +506,25 @@ struct blorp_blit_prog_key uint8_t local_y; }; +struct blorp_indirect_copy_mem2img_key { + /* How many dimensions does our image have? 1, 2 or 3. */ + uint32_t dimensions; + + /* When compressed formats are used, we pretend they are a non-compressed + * format, of the same bpb. Since we can't maintain the exact same layout + * of mipmap and layer offsets, we're forced to make adjustments to where X + * and Y actually start, and are also forced to copy only one layer (or Z + * axis position) per shader invocation. + */ + int forced_layer_or_z; + + /* Info taken from isl_format_layout. */ + uint16_t format_Bpb; + uint16_t format_bw; + uint16_t format_bh; + uint16_t format_bd; +}; + /** * \name BLORP internals * \{ @@ -597,6 +655,19 @@ blorp_op_type_is_clear(enum blorp_op op) } } +/* This means: blorp->wm_inputs.indirect should be used. 
*/ +static inline bool +blorp_op_type_is_indirect(enum blorp_op op) +{ + switch (op) { + case BLORP_OP_COPY_INDIRECT: + case BLORP_OP_COPY_IMAGE_INDIRECT: + return true; + default: + return false; + } +} + /* Asserts unless the surface is a buffer to image copy */ #define blorp_assert_is_buffer(surf, view) \ do { \ diff --git a/src/intel/blorp/blorp_shaders.cl b/src/intel/blorp/blorp_shaders.cl index 08dd30a37f9..27fce179cb9 100644 --- a/src/intel/blorp/blorp_shaders.cl +++ b/src/intel/blorp/blorp_shaders.cl @@ -3,6 +3,7 @@ */ #include "compiler/libcl/libcl.h" +#include "compiler/libcl/libcl_vk.h" #include "compiler/nir/nir_defines.h" #include "compiler/shader_enums.h" @@ -15,3 +16,279 @@ blorp_check_in_bounds(uint4 bounds_rect, uint2 pos) return pos.x >= x0 && pos.x < x1 && pos.y >= y0 && pos.y < y1; } + +void nir_image_store(uint handle, int4 coords, uint sample_index, + uint4 colors, uint lod, uint image_dim, + uint image_array, uint format, uint access, + uint range_base, uint src_type); + +/* Used by vkCmdCopyMemoryIndirectKHR. */ +void +blorp_copy_memory_indirect_shader( + global uint *indirect_buf, + uint copy_count, + ulong stride_bytes, + uint global_id) +{ + for (uint c = 0; c < copy_count; c++) { + uint idx = (uint)(c * (stride_bytes / 4)); + + /* The spec says the minimum alignment is 4 instead of 8, so we have to + * do these tricks. 
+ */ + VkCopyMemoryIndirectCommandKHR cmd = + (*(global VkCopyMemoryIndirectCommandKHR*)&indirect_buf[idx]); + + uint copy_size_ints = cmd.size / 4; + + global int *src = (global int*)cmd.srcAddress; + global int *dst = (global int*)cmd.dstAddress; + + if (global_id < copy_size_ints) + dst[global_id] = src[global_id]; + } +} + +struct img_copy_params { + ulong src_address; + uint2 row_size_px; + uint base_layer; + uint layer_count; + uint3 offset; + uint3 extent; +}; + +void +read_img_params( + global uint *indirect_buf, + uint copy_idx, + ulong stride_bytes, + uint dimensions, + uint max_layer, + uint3 format_block_size, + bool is_block_compressed, + struct img_copy_params *p) +{ + uint idx = copy_idx * (stride_bytes / 4); + + VkCopyMemoryToImageIndirectCommandKHR cmd = + (*(global VkCopyMemoryToImageIndirectCommandKHR*)&indirect_buf[idx]); + + p->src_address = cmd.srcAddress; + p->row_size_px = (uint2)(cmd.bufferRowLength, cmd.bufferImageHeight); + + /* We don't use imageSubresource.aspectMask and imageSubresource.mipLevel, + * those are dealt with when the application calls + * vkCmdCopyMemoryToImageIndirectKHR(). + */ + + p->base_layer = cmd.imageSubresource.baseArrayLayer; + p->layer_count = cmd.imageSubresource.layerCount; + + p->offset = (uint3)(cmd.imageOffset.x, cmd.imageOffset.y, + cmd.imageOffset.z); + p->extent = (uint3)(cmd.imageExtent.width, cmd.imageExtent.height, + cmd.imageExtent.depth); + + if (p->row_size_px.x == 0) + p->row_size_px.x = p->extent.x; + if (p->row_size_px.y == 0) + p->row_size_px.y = p->extent.y; + + /* Our code deals with blocks, not pixels. */ + if (is_block_compressed) { + p->offset /= format_block_size; + p->extent = DIV_ROUND_UP(p->extent, format_block_size); + } + + /* Users can pass 3D images with the Z axis as an array layer. 
*/ + if (dimensions == 3 && (p->base_layer != 0 || p->layer_count != 1)) { + p->offset.z = p->base_layer; + p->extent.z = p->layer_count; + p->base_layer = 0; + p->layer_count = 1; + } + + /* This handles VK_REMAINING_ARRAY_LAYERS and bugs. */ + if (p->base_layer + p->layer_count > max_layer + 1) + p->layer_count = max_layer - p->base_layer + 1; +} + +uint4 +get_pixel( + ulong src_address, + uint format_Bpb) +{ + switch (format_Bpb) { + case 1: { + global uchar *src_buf = (global uchar *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 2: { + global ushort *src_buf = (global ushort *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 4: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(src_buf[0], 0, 0, 0); + } + case 8: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(vload2(0, src_buf), 0, 0); + } + case 16: { + global uint *src_buf = (global uint *)src_address; + return (uint4)(vload4(0, src_buf)); + } + default: + /* TODO: support 3, 6, 12. */ + return (uint4)(0xFF, 0, 0, 0xFF); + } +} + +int4 +get_coords( + uint3 pos, + uint layer, + uint2 dest_coord_offsets, + int dimensions, + int forced_layer_or_z) +{ + int4 ret; + + pos.xy += dest_coord_offsets; + + switch (dimensions) { + case 1: + ret = (int4)(pos.x, layer, 0, 0); + break; + case 2: + ret = (int4)(pos.x, pos.y, layer, 0); + break; + case 3: + default: + ret = (int4)(pos.x, pos.y, pos.z, 0); + break; + } + + if (forced_layer_or_z != -1) { + ret.z = 0; + ret.w = 0; + } + + return ret; +} + +void +write_pixel( + const int4 coords, + const uint4 colors) +{ + /* We don't seem to need to set image_array to true if we set sampler_dim + * to 3D. + * Setting mip_level does not do anything on Intel, we set mip levels + * through the bindings. + */ + nir_image_store(0 /* The image handle. */, + coords, /* See get_coords(). */ + 0, /* Sample index for multi-sampling. */ + colors, /* The RGBA pixels, in src_type. 
*/ + 0 /* mip_level */, + GLSL_SAMPLER_DIM_3D, + false, /* image_array */ + 0, /* format */ + ACCESS_NON_READABLE, /* access */ + 0, /* range_base */ + nir_type_int32 /* src_type */); +} + +/* Used by vkCmdCopyMemoryToImageIndirectKHR. */ +void +blorp_copy_memory_to_image_indirect_shader( + /* Actual parameters. */ + global uint *indirect_buf, + ulong stride_bytes, + uint copy_idx, + uint max_layer, + uint2 dest_coord_offsets, + uint3 global_id, + + /* These are shader keys, they are NIR immediates. */ + uint dimensions, + int forced_layer_or_z, + ushort format_Bpb, + uint3 format_block_size, + bool is_block_compressed) +{ + /* We have one invocation per texel of a given a mip level. This means + * that for the pixels outside the copy area, we'll hit the 'continue' + * below. + */ + uint3 src_pos = global_id; + uint layer; + + /* 'forced_layer_or_z' means that whatever slice or layer we're + * trying to work with is set as part of our binding (view) as depth + * 0 or layer 0, depending on the dimensionality. This can happen + * when we're trying to pretend a format is something that it's not + * (e.g., we're treating a block compressed 4x4 64bpp format as an + * r32g32 format). 
+ */ + if (forced_layer_or_z != -1) { + if (dimensions == 3) { + src_pos.z = forced_layer_or_z; + layer = 0; + } else { + src_pos.z = 0; + layer = forced_layer_or_z; + } + } else { + if (dimensions == 3) { + src_pos.z = global_id.z; + layer = 0; + } else { + src_pos.z = 0; + layer = global_id.z; + } + } + + struct img_copy_params p; + read_img_params(indirect_buf, copy_idx, stride_bytes, dimensions, + max_layer, format_block_size, is_block_compressed, &p); + + if (any(src_pos >= p.extent)) + return; + + uint3 dst_pos = src_pos + p.offset; + if (forced_layer_or_z == -1 && dimensions != 3) + layer = global_id.z + p.base_layer; + + if (layer > p.base_layer + p.layer_count - 1) + return; + + uint2 row_size_blocks = p.row_size_px; + if (is_block_compressed) { + row_size_blocks = DIV_ROUND_UP(row_size_blocks, + format_block_size.xy); + } + uint row_length_bytes = row_size_blocks.x * format_Bpb; + uint row_height_bytes = row_size_blocks.y * row_length_bytes; + + uint buf_z_offset_bytes = src_pos.z * row_height_bytes; + uint buf_y_offset_bytes = src_pos.y * row_length_bytes; + uint buf_x_offset_bytes = src_pos.x * format_Bpb; + + uint layer_offset_bytes = (layer - p.base_layer) * row_height_bytes; + uint buf_offset_bytes = layer_offset_bytes + + buf_z_offset_bytes + + buf_y_offset_bytes + + buf_x_offset_bytes; + + uint4 colors = get_pixel(p.src_address + buf_offset_bytes, + format_Bpb); + + int4 coords = get_coords(dst_pos, layer, dest_coord_offsets, dimensions, + forced_layer_or_z); + + write_pixel(coords, colors); +} diff --git a/src/intel/blorp/meson.build b/src/intel/blorp/meson.build index 1b82cc5ce1f..9ccbf59275b 100644 --- a/src/intel/blorp/meson.build +++ b/src/intel/blorp/meson.build @@ -37,6 +37,7 @@ files_libblorp = files( 'blorp.h', 'blorp_blit.c', 'blorp_clear.c', + 'blorp_indirect_copy.c', 'blorp_nir_builder.h', 'blorp_priv.h', ) diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 3348d95e589..899c25e2df2 100644 --- 
a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -2811,3 +2811,135 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, anv_blorp_batch_finish(&batch); } + +void +anv_CmdCopyMemoryIndirectKHR( + VkCommandBuffer commandBuffer, + const VkCopyMemoryIndirectInfoKHR* pCopyMemoryIndirectInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (pCopyMemoryIndirectInfo->copyCount == 0) + return; + + const uint64_t indirect_buf_addr = + pCopyMemoryIndirectInfo->copyAddressRange.address; + const uint32_t copy_count = pCopyMemoryIndirectInfo->copyCount; + const uint64_t stride = pCopyMemoryIndirectInfo->copyAddressRange.stride; + + /* These are all restrictions by the spec. */ + assert((pCopyMemoryIndirectInfo->srcCopyFlags & + VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0); + assert((pCopyMemoryIndirectInfo->dstCopyFlags & + VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0); + assert(pCopyMemoryIndirectInfo->copyAddressRange.size >= + pCopyMemoryIndirectInfo->copyCount * + pCopyMemoryIndirectInfo->copyAddressRange.stride); + + assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer)); + + enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE; + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags); + + blorp_copy_memory_indirect(&batch, indirect_buf_addr, copy_count, stride); + + anv_blorp_batch_finish(&batch); +} + +void +anv_CmdCopyMemoryToImageIndirectKHR( + VkCommandBuffer commandBuffer, + const VkCopyMemoryToImageIndirectInfoKHR* pCopyMemoryToImageIndirectInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, anv_image, + pCopyMemoryToImageIndirectInfo->dstImage); + + if (pCopyMemoryToImageIndirectInfo->copyCount == 0) + return; + + const uint64_t indirect_buf_addr = + pCopyMemoryToImageIndirectInfo->copyAddressRange.address; + const uint32_t copy_count = pCopyMemoryToImageIndirectInfo->copyCount; + const uint64_t stride = + 
pCopyMemoryToImageIndirectInfo->copyAddressRange.stride; + const VkImageLayout img_layout = + pCopyMemoryToImageIndirectInfo->dstImageLayout; + + assert((pCopyMemoryToImageIndirectInfo->srcCopyFlags & + VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0); + assert(pCopyMemoryToImageIndirectInfo->copyAddressRange.size >= + pCopyMemoryToImageIndirectInfo->copyCount * + pCopyMemoryToImageIndirectInfo->copyAddressRange.stride); + + assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer)); + + enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE; + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags); + + for (int c = 0; c < copy_count; c++) { + const VkImageSubresourceLayers *img_subresource = + &pCopyMemoryToImageIndirectInfo->pImageSubresources[c]; + VkImageAspectFlags aspect_mask = img_subresource->aspectMask; + uint32_t mip_level = img_subresource->mipLevel; + uint32_t base_layer = img_subresource->baseArrayLayer; + uint32_t layer_count = img_subresource->layerCount; + + assert(mip_level != VK_REMAINING_MIP_LEVELS); + if (layer_count == VK_REMAINING_ARRAY_LAYERS) + layer_count = anv_image->vk.array_layers - base_layer; + + const unsigned plane = + anv_image_aspect_to_plane(anv_image, aspect_mask); + struct isl_surf *img_isl_surf = + &anv_image->planes[plane].primary_surface.isl; + enum isl_format format = img_isl_surf->format; + bool format_is_compressed = isl_format_is_compressed(format); + + struct blorp_surf img_blorp_surf; + get_blorp_surf_for_anv_image(cmd_buffer, anv_image, + aspect_mask, VK_IMAGE_USAGE_STORAGE_BIT, + img_layout, + anv_image->planes[plane].aux_usage, + format, false /* cross_aspect */, + &img_blorp_surf); + + anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image, + img_subresource->aspectMask, + img_blorp_surf.aux_usage, mip_level, + base_layer, layer_count); + + /* If the format is compressed, we will pretend the format is one where + * each pixel is the size of the compressed block, so image stores can + * work. 
Unfortunately, that translation only works if we do it for one + * mip level of a specific layer, as the complete layouts of, say, BC4 + * and R32G32 formats are not compatible: so we loop through all layers + * here. + */ + if (format_is_compressed) { + int min_l_or_z, top_l_or_z; + if (img_blorp_surf.surf->dim == ISL_SURF_DIM_3D) { + min_l_or_z = 0; + top_l_or_z = img_isl_surf->logical_level0_px.depth >> mip_level; + } else { + min_l_or_z = base_layer; + top_l_or_z = base_layer + layer_count; + } + for (int l_or_z = min_l_or_z; l_or_z < top_l_or_z; l_or_z++) { + blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf, + indirect_buf_addr, + stride, c, mip_level, + layer_count, l_or_z); + } + } else { + blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf, + indirect_buf_addr, stride, c, + mip_level, layer_count, -1); + } + } + + anv_blorp_batch_finish(&batch); +} diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c index 22f177adf2c..cbbebbf6f34 100644 --- a/src/intel/vulkan/anv_formats.c +++ b/src/intel/vulkan/anv_formats.c @@ -629,6 +629,39 @@ anv_get_format_aspect(const struct anv_physical_device *device, return anv_get_format_plane(device, vk_format, plane, tiling); } +static bool +anv_format_supports_indirect_copies(const struct anv_physical_device *pdevice, + const struct anv_format *anv_format) +{ + const struct isl_format_layout *fmtl = + isl_format_get_layout(anv_format->planes[0].isl_format); + + /* CTS insists on it even when we say we don't support it. */ + if (!pdevice->vk.supported_features.indirectMemoryToImageCopy) + return false; + + /* TODO: implement support for this in the copy shader. */ + if (!util_is_power_of_two_or_zero(fmtl->bpb)) + return false; + + /* TODO: we use compute for indirect copies, and compute cannot write HiZ; + * we could try to support that if we see that applications want it. 
+ */ + if (vk_format_is_depth_or_stencil(anv_format->vk_format)) + return false; + + /* Let's leave YCbCr and multi-planar formats out until we have proper + * tests to verify they work. + */ + if (isl_format_is_yuv(anv_format->planes[0].isl_format)) + return false; + + if (anv_format->n_planes > 1) + return false; + + return true; +} + // Format capabilities static bool @@ -852,7 +885,8 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device, if (vk_tiling == VK_IMAGE_TILING_LINEAR && isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC) return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | - VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR; flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT | @@ -935,6 +969,10 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device, VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; } + if ((flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT) && + anv_format_supports_indirect_copies(physical_device, anv_format)) + flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR; + /* XXX: We handle 3-channel formats by switching them out for RGBX or * RGBA formats behind-the-scenes. This works fine for textures * because the upload process will fill in the extra channel. 
diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 5b723093e86..06b4502e12a 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -152,6 +152,7 @@ get_device_extensions(const struct anv_physical_device *device, .KHR_cooperative_matrix = device->has_cooperative_matrix, .NV_cooperative_matrix2 = device->has_cooperative_matrix, .KHR_copy_commands2 = true, + .KHR_copy_memory_indirect = true, .KHR_create_renderpass2 = true, .KHR_dedicated_allocation = true, .KHR_deferred_host_operations = true, @@ -1023,6 +1024,10 @@ get_features(const struct anv_physical_device *pdevice, /* VK_KHR_pipeline_binary */ .pipelineBinaries = true, + /* VK_KHR_copy_memory_indirect */ + .indirectMemoryCopy = true, + .indirectMemoryToImageCopy = pdevice->info.ver >= 12, + #ifdef ANV_USE_WSI_PLATFORM /* VK_EXT_present_timing */ .presentTiming = true, @@ -2205,6 +2210,11 @@ get_properties(const struct anv_physical_device *pdevice, */ props->shaderBinaryVersion = 0; } + + /* VK_KHR_copy_memory_indirect */ + { + props->supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT; + } } /* This function restricts the maximum size of system memory heap. The diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 376d7dc06b5..a6a9362fff4 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -534,6 +534,8 @@ get_color_aux_op(const struct blorp_params *params) case BLORP_OP_SLOW_COLOR_CLEAR: case BLORP_OP_BLIT: case BLORP_OP_COPY: + case BLORP_OP_COPY_INDIRECT: + case BLORP_OP_COPY_IMAGE_INDIRECT: assert(params->fast_clear_op == ISL_AUX_OP_NONE); return ANV_COLOR_AUX_OP_CLASS_NONE; }