Merge branch 'copy-mem-indirect-blorp' into 'main'

anv: implement VK_KHR_copy_memory_indirect

See merge request mesa/mesa!39338
Paulo Zanoni 2026-05-07 17:21:51 -07:00
commit 889ee7456d
15 changed files with 1270 additions and 18 deletions


@ -542,7 +542,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_calibrated_timestamps DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn)
VK_KHR_compute_shader_derivatives DONE (anv, lvp, nvk, radv, tu, vn)
VK_KHR_cooperative_matrix DONE (anv, nvk/Turing+, radv/gfx11+, vn)
VK_KHR_copy_memory_indirect DONE (nvk, radv/gfx8+)
VK_KHR_copy_memory_indirect DONE (anv, nvk, radv/gfx8+)
VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv, tu, vn)
VK_KHR_depth_clamp_zero_one DONE (anv, nvk, panvk, radv, tu, vn)
VK_KHR_device_address_commands DONE (radv)


@ -11,6 +11,7 @@ typedef uint32_t VkBool32;
typedef uint64_t VkDeviceAddress;
typedef uint64_t VkDeviceSize;
typedef uint32_t VkFlags;
typedef VkFlags VkImageAspectFlags;
typedef enum VkQueryType {
VK_QUERY_TYPE_OCCLUSION = 0,
@ -126,6 +127,25 @@ typedef enum VkDepthBiasRepresentationEXT {
VK_DEPTH_BIAS_REPRESENTATION_MAX_ENUM_EXT = 0x7FFFFFFF
} VkDepthBiasRepresentationEXT;
typedef struct VkOffset3D {
int32_t x;
int32_t y;
int32_t z;
} VkOffset3D __attribute__((aligned(4)));
typedef struct VkExtent3D {
uint32_t width;
uint32_t height;
uint32_t depth;
} VkExtent3D __attribute__((aligned(4)));
typedef struct VkImageSubresourceLayers {
VkImageAspectFlags aspectMask;
uint32_t mipLevel;
uint32_t baseArrayLayer;
uint32_t layerCount;
} VkImageSubresourceLayers __attribute__((aligned(4)));
typedef struct VkDispatchIndirectCommand {
uint32_t x;
uint32_t y;
@ -186,4 +206,13 @@ typedef struct VkCopyMemoryIndirectCommandKHR {
VkDeviceAddress srcAddress;
VkDeviceAddress dstAddress;
VkDeviceSize size;
} VkCopyMemoryIndirectCommandKHR;
} VkCopyMemoryIndirectCommandKHR __attribute__((aligned(4)));
typedef struct VkCopyMemoryToImageIndirectCommandKHR {
VkDeviceAddress srcAddress;
uint32_t bufferRowLength;
uint32_t bufferImageHeight;
VkImageSubresourceLayers imageSubresource;
VkOffset3D imageOffset;
VkExtent3D imageExtent;
} VkCopyMemoryToImageIndirectCommandKHR __attribute__((aligned(4)));
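These stub typedefs mirror the application-visible structures; the layouts above are exactly what the copy shaders added later in this series read back out of the application's indirect buffer. As a host-side reference only (not part of this change; the helper name and the sType constant are assumptions, and loading the extension entry point via vkGetDeviceProcAddr is omitted), recording a single indirect copy looks roughly like this:

#include <vulkan/vulkan.h>

/* Illustrative sketch only: assumes VK_KHR_copy_memory_indirect is enabled,
 * 'cmds' is a host mapping of a buffer whose device address is
 * 'indirect_addr', and 'src'/'dst' are valid device addresses.
 */
static void
record_one_indirect_copy(VkCommandBuffer cmd,
                         VkCopyMemoryIndirectCommandKHR *cmds,
                         VkDeviceAddress indirect_addr,
                         VkDeviceAddress src, VkDeviceAddress dst,
                         VkDeviceSize size)
{
   /* One packed record; the stride passed below must match this layout. */
   cmds[0] = (VkCopyMemoryIndirectCommandKHR) {
      .srcAddress = src,
      .dstAddress = dst,
      .size = size,
   };

   const VkCopyMemoryIndirectInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_COPY_MEMORY_INDIRECT_INFO_KHR, /* assumed name */
      .copyAddressRange = {
         .address = indirect_addr,
         .stride = sizeof(VkCopyMemoryIndirectCommandKHR),
         .size = sizeof(VkCopyMemoryIndirectCommandKHR),
      },
      .copyCount = 1,
   };
   vkCmdCopyMemoryIndirectKHR(cmd, &info);
}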


@ -534,3 +534,10 @@ blorp_emit_post_draw(struct blorp_batch *blorp_batch, const struct blorp_params
genX(maybe_emit_breakpoint)(batch, false);
blorp_measure_end(blorp_batch, params);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
{
struct iris_batch *batch = blorp_batch->driver_batch;
return &batch->write_fence_status;
}


@ -41,6 +41,8 @@ typedef struct nir_shader nir_shader;
enum blorp_op {
BLORP_OP_BLIT,
BLORP_OP_COPY,
BLORP_OP_COPY_INDIRECT,
BLORP_OP_COPY_IMAGE_INDIRECT,
BLORP_OP_CCS_AMBIGUATE,
BLORP_OP_CCS_COLOR_CLEAR,
BLORP_OP_CCS_PARTIAL_RESOLVE,
@ -299,6 +301,22 @@ blorp_buffer_copy(struct blorp_batch *batch,
struct blorp_address dst,
uint64_t size);
void
blorp_copy_memory_indirect(struct blorp_batch *batch,
uint64_t indirect_buf_addr,
uint32_t copy_count,
uint64_t stride);
void
blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
const struct blorp_surf *img_blorp_surf,
uint64_t indirect_buf_addr,
uint64_t indirect_buf_stride,
uint32_t first_copy_idx,
uint32_t img_mip_level,
int layer_count,
int forced_layer_or_z);
void
blorp_fast_clear(struct blorp_batch *batch,
const struct blorp_surf *surf,


@ -148,6 +148,9 @@ brw_blorp_get_urb_length(const struct brw_fs_prog_data *prog_data)
return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *batch);
/***** BEGIN blorp_exec implementation ******/
static uint64_t
@ -164,9 +167,25 @@ _blorp_combine_address(struct blorp_batch *batch, void *location,
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address
#define __gen_get_write_fencing_status(b) blorp_get_write_fencing_status(b)
#define __gen_get_batch_dwords(b, d) blorp_emit_dwords((b), (d))
static inline struct blorp_address
__gen_address_offset(struct blorp_address addr, uint64_t offset)
{
addr.offset += offset;
return addr;
}
static inline struct blorp_address
__gen_get_batch_address(struct blorp_batch *batch, void *location)
{
UNREACHABLE("Not supported by blorp");
}
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/mi_builder.h"
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
@ -1361,6 +1380,10 @@ blorp_setup_binding_table(struct blorp_batch *batch,
uint32_t surface_offsets[BLORP_NUM_BT_ENTRIES], bind_offset = 0;
void *surface_maps[BLORP_NUM_BT_ENTRIES];
/* There's nothing to bind here. */
if (params->op == BLORP_OP_COPY_INDIRECT)
return 0;
if (params->use_pre_baked_binding_table) {
bind_offset = params->pre_baked_binding_table_offset;
} else {
@ -1765,6 +1788,170 @@ blorp_get_compute_push_const(struct blorp_batch *batch,
*state_size = push_const_size;
}
static void
blorp_indirect_buffer_get_dispatch_size(struct blorp_batch *batch,
const struct blorp_params *params,
struct mi_builder *b,
struct mi_value *size_x,
struct mi_value *size_y,
struct mi_value *size_z)
{
struct blorp_context *blorp = batch->blorp;
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
uint32_t copy_count = params->wm_inputs.indirect.copy_count;
size_t size_offset = 16; /* offsetof(VkCopyMemoryIndirectCommandKHR, size) */
struct mi_value biggest_copy_size;
for (int c = 0; c < copy_count; c++) {
struct blorp_address copy_size_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + c * stride + size_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct mi_value this_copy_size = mi_mem32(copy_size_addr);
if (c == 0)
biggest_copy_size = this_copy_size;
else
biggest_copy_size = mi_umax2(b, this_copy_size, biggest_copy_size);
}
/* Each shader invocation writes a uint32_t. */
int divisor = cs_prog_data->local_size[0] * sizeof(uint32_t);
*size_x = mi_udiv32_imm(b, mi_iadd_imm(b, biggest_copy_size, divisor - 1),
divisor);
assert(cs_prog_data->local_size[1] == 1);
assert(cs_prog_data->local_size[2] == 1);
*size_y = mi_imm(1);
*size_z = mi_imm(1);
}
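For reference, the host-side arithmetic that the MI commands above reproduce on the GPU would look like the sketch below (a model only, with a hypothetical helper name; the actual per-copy sizes live in the indirect buffer and are never visible to the CPU, which is why the dispatch width has to be computed with MI_MATH):

#include <stdint.h>

/* Host-side model of the dispatch sizing above: one workgroup covers
 * local_size_x invocations, and each invocation copies one uint32_t.
 */
static uint32_t
model_dispatch_width(const uint64_t *copy_sizes, uint32_t copy_count,
                     uint32_t local_size_x)
{
   uint64_t biggest = 0;
   for (uint32_t c = 0; c < copy_count; c++)
      biggest = copy_sizes[c] > biggest ? copy_sizes[c] : biggest;

   const uint64_t divisor = local_size_x * sizeof(uint32_t);
   return (uint32_t)((biggest + divisor - 1) / divisor); /* DIV_ROUND_UP */
}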
static void
blorp_indirect_buf2img_get_dispatch_size(struct blorp_batch *batch,
const struct blorp_params *params,
struct mi_builder *b,
struct mi_value *size_x,
struct mi_value *size_y,
struct mi_value *size_z)
{
struct blorp_context *blorp = batch->blorp;
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
uint32_t copy_idx = params->wm_inputs.indirect.copy_idx;
bool is_forced_layer = params->wm_inputs.indirect.forced_layer_or_z != -1;
bool is_3d = params->wm_inputs.indirect.dimensions == 3;
/* These are all offsetof(VkCopyMemoryToImageIndirectCommandKHR, x). */
const size_t img_extent_x_offset = 44;
const size_t img_extent_y_offset = 48;
const size_t img_extent_z_offset = 52;
struct blorp_address x_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_x_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct blorp_address y_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_y_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct blorp_address z_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_z_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
/* Notes on the 'params->num_layers' usage below:
 *
 * - Please see how we handle this in
 *   blorp_copy_memory_to_image_indirect().
 *
 * - If we're using forced layers (see
 *   params.wm_inputs.indirect.forced_layer_or_z), then we can just set
 *   size_z to 1 and not even look at the indirect buffer: each shader
 *   call operates on a single layer.
 *
 * - The information on the number of layers for each copy is not truly
 *   indirect: it has to be passed in during command creation, so we have
 *   already processed it. See 'max_layer_count' in
 *   blorp_copy_memory_to_image_indirect(). We don't want to be looking
 *   at the indirect buffer if we don't need to.
 *
 * - For 3D images, applications can use the Z axis either as an actual
 *   axis (offset.z + extent.z) or pretend the Z axis is layers
 *   (base_layer + layer_count), so we would have to check both places.
 *   Fortunately, base_layer + layer_count was already checked (see
 *   above) and recorded in params->num_layers. If params->num_layers is
 *   greater than 1, then we don't even bother looking at extent.z.
 */
struct mi_value x_extent = mi_mem32(x_extent_addr);
struct mi_value y_extent = mi_mem32(y_extent_addr);
struct mi_value z_extent;
if (is_forced_layer) {
z_extent = mi_imm(1);
} else {
if (is_3d && params->num_layers == 1)
z_extent = mi_mem32(z_extent_addr);
else
z_extent = mi_imm(params->num_layers);
}
int divisor_x = cs_prog_data->local_size[0];
int divisor_y = cs_prog_data->local_size[1];
int divisor_z = cs_prog_data->local_size[2];
assert(divisor_x != 1 && divisor_y != 1 && divisor_z == 1);
*size_x = mi_udiv32_imm(b, mi_iadd_imm(b, x_extent, divisor_x - 1),
divisor_x);
*size_y = mi_udiv32_imm(b, mi_iadd_imm(b, y_extent, divisor_y - 1),
divisor_y);
*size_z = z_extent;
}
static void
blorp_indirect_write_gpgpu_dispatch_regs(struct blorp_batch *batch,
const struct blorp_params *params)
{
struct blorp_context *blorp = batch->blorp;
const struct intel_device_info *devinfo = blorp->compiler->brw->devinfo;
struct mi_builder b;
mi_builder_init(&b, devinfo, batch);
mi_builder_set_mocs(&b, isl_mocs(blorp->isl_dev, 0, false));
struct mi_value size_x, size_y, size_z;
if (params->op == BLORP_OP_COPY_INDIRECT) {
blorp_indirect_buffer_get_dispatch_size(batch, params, &b, &size_x,
&size_y, &size_z);
} else {
assert(params->op == BLORP_OP_COPY_IMAGE_INDIRECT);
blorp_indirect_buf2img_get_dispatch_size(batch, params, &b, &size_x,
&size_y, &size_z);
}
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMX_num)), size_x);
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMY_num)), size_y);
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), size_z);
}
static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
@ -1779,6 +1966,11 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
bool use_indirect = blorp_op_type_is_indirect(params->op);
if (use_indirect)
blorp_indirect_write_gpgpu_dispatch_regs(batch, params);
uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
uint32_t group_z0 = params->dst.z_offset;
@ -1892,6 +2084,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
assert(cs_prog_data->push.per_thread.regs == 0);
blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
cw.IndirectParameterEnable = use_indirect;
cw.body = body;
}
#else /* GFX_VERx10 >= 125 */
@ -1961,6 +2154,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
}
blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
ggw.IndirectParameterEnable = use_indirect;
ggw.SIMDSize = dispatch.simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;


@ -0,0 +1,355 @@
/* Copyright © 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "blorp_nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
#include "blorp_priv.h"
#include "dev/intel_debug.h"
#include "dev/intel_device_info.h"
#include "blorp_shaders.h"
/* Refer to struct blorp_wm_inputs_indirect. */
struct blorp_indirect_vars {
nir_variable *indirect_buf_addr;
nir_variable *stride;
nir_variable *copy_count;
nir_variable *copy_idx;
nir_variable *max_layer;
nir_variable *x_offset;
nir_variable *y_offset;
};
static enum isl_format
get_format_for_copy(int format_bpb)
{
switch (format_bpb) {
case 8: return ISL_FORMAT_R8_UINT;
case 16: return ISL_FORMAT_R16_UINT;
case 24: return ISL_FORMAT_R8G8B8_UINT;
case 32: return ISL_FORMAT_R32_UINT;
case 48: return ISL_FORMAT_R16G16B16_UINT;
case 64: return ISL_FORMAT_R32G32_UINT;
case 96: return ISL_FORMAT_R32G32B32_UINT;
case 128: return ISL_FORMAT_R32G32B32A32_UINT;
default:
mesa_loge("unexpected format bpb: %d", format_bpb);
assert(false);
return ISL_FORMAT_UNSUPPORTED;
}
}
static void
blorp_indirect_vars_init(nir_builder *b, struct blorp_indirect_vars *v)
{
v->copy_count =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_count,
glsl_uint_type());
v->indirect_buf_addr =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_addr,
glsl_uint64_t_type());
v->stride =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_stride,
glsl_uint64_t_type());
v->copy_idx =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_idx,
glsl_uint_type());
v->max_layer =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.max_layer,
glsl_uint_type());
v->x_offset =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.x_offset,
glsl_uint_type());
v->y_offset =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.y_offset,
glsl_uint_type());
}
static nir_shader *
blorp_build_copy_mem_indirect_shader(struct blorp_batch *batch,
void *mem_ctx)
{
struct blorp_context *blorp = batch->blorp;
mesa_shader_stage stage = MESA_SHADER_COMPUTE;
const nir_shader_compiler_options *nir_options =
blorp->compiler->nir_options(blorp, stage);
nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
"copy mem indirect");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size[0] = 32;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
struct blorp_indirect_vars v;
blorp_indirect_vars_init(&b, &v);
/* The indirect buffer is an array containing 'copy_count'
* VkCopyMemoryIndirectCommandKHR structures, separated by 'stride' bytes.
*/
nir_def *copy_count = nir_load_var(&b, v.copy_count);
nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
nir_def *stride = nir_load_var(&b, v.stride);
nir_def *global_id =
nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
blorp_copy_memory_indirect_shader(&b, indirect_buf_addr, copy_count,
stride, global_id);
return b.shader;
}
static nir_shader *
blorp_build_copy_mem2img_indirect_shader(struct blorp_batch *batch,
void *mem_ctx,
struct blorp_indirect_copy_mem2img_key *key)
{
struct blorp_context *blorp = batch->blorp;
mesa_shader_stage stage = MESA_SHADER_COMPUTE;
const nir_shader_compiler_options *nir_options =
blorp->compiler->nir_options(blorp, stage);
nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
"copy mem2img indirect");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size[0] = 4;
b.shader->info.workgroup_size[1] = 8;
b.shader->info.workgroup_size[2] = 1;
struct blorp_indirect_vars v;
blorp_indirect_vars_init(&b, &v);
/* The indirect buffer is an array containing 'copy_count'
* VkCopyMemoryToImageIndirectCommandKHR structures, separated by 'stride'
* bytes.
*/
nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
nir_def *stride = nir_load_var(&b, v.stride);
nir_def *copy_idx = nir_load_var(&b, v.copy_idx);
nir_def *max_layer = nir_load_var(&b, v.max_layer);
nir_def *dest_coord_offsets_arr[2] = {
nir_load_var(&b, v.x_offset),
nir_load_var(&b, v.y_offset),
};
nir_def *dest_coord_offsets = nir_vec(&b, dest_coord_offsets_arr, 2);
nir_def *global_id = nir_load_global_invocation_id(&b, 32);
/* Shader keys (constants). */
nir_def *dimensions = nir_imm_int(&b, key->dimensions);
nir_def *forced_layer_or_z = nir_imm_int(&b, key->forced_layer_or_z);
nir_def *format_Bpb = nir_imm_intN_t(&b, key->format_Bpb, 16);
nir_def *format_block_size_arr[3] = {
nir_imm_int(&b, key->format_bw),
nir_imm_int(&b, key->format_bh),
nir_imm_int(&b, key->format_bd),
};
nir_def *format_block_size = nir_vec(&b, format_block_size_arr, 3);
/* Constants derived from the shader keys. */
nir_def *is_block_compressed =
nir_imm_bool(&b, key->format_bw > 1 || key->format_bh > 1 ||
key->format_bd > 1);
blorp_copy_memory_to_image_indirect_shader(&b, indirect_buf_addr,
stride,
copy_idx,
max_layer,
dest_coord_offsets,
global_id,
dimensions,
forced_layer_or_z,
format_Bpb,
format_block_size,
is_block_compressed);
return b.shader;
}
static bool
blorp_get_copy_mem_indirect_kernel_cs(struct blorp_batch *batch,
struct blorp_params *params)
{
struct blorp_context *blorp = batch->blorp;
const char *key = "copy_mem_indirect_kernel_cs";
uint32_t key_size = strlen(key);
if (blorp->lookup_shader(batch, key, key_size, &params->cs_prog_kernel,
&params->cs_prog_data))
return true;
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir =
blorp_build_copy_mem_indirect_shader(batch, mem_ctx);
const struct blorp_program prog =
blorp_compile_cs(blorp, mem_ctx, nir, key, key_size);
bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
key, key_size,
prog.kernel, prog.kernel_size,
prog.prog_data, prog.prog_data_size,
&params->cs_prog_kernel,
&params->cs_prog_data);
ralloc_free(mem_ctx);
return result;
}
static bool
blorp_get_copy_mem2img_indirect_kernel_cs(struct blorp_batch *batch,
struct blorp_params *params,
struct blorp_indirect_copy_mem2img_key *key)
{
struct blorp_context *blorp = batch->blorp;
if (blorp->lookup_shader(batch, key, sizeof(*key), &params->cs_prog_kernel,
&params->cs_prog_data))
return true;
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir =
blorp_build_copy_mem2img_indirect_shader(batch, mem_ctx, key);
const struct blorp_program prog =
blorp_compile_cs(blorp, mem_ctx, nir, key, sizeof(*key));
bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
key, sizeof(*key),
prog.kernel, prog.kernel_size,
prog.prog_data, prog.prog_data_size,
&params->cs_prog_kernel,
&params->cs_prog_data);
ralloc_free(mem_ctx);
return result;
}
void
blorp_copy_memory_indirect(struct blorp_batch *batch,
uint64_t indirect_buf_addr,
uint32_t copy_count,
uint64_t stride)
{
struct blorp_params params;
blorp_params_init(&params);
params.op = BLORP_OP_COPY_INDIRECT;
params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
params.x0 = 0;
params.y0 = 0;
params.x1 = 1;
params.y1 = 1;
params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
params.wm_inputs.indirect.indirect_buf_stride = stride;
params.wm_inputs.indirect.copy_count = copy_count;
if (!blorp_get_copy_mem_indirect_kernel_cs(batch, &params)) {
mesa_loge("failed to get copy_memory_indirect CS kernel");
assert(false);
return;
}
batch->blorp->exec(batch, &params);
}
void
blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
const struct blorp_surf *img_blorp_surf,
uint64_t indirect_buf_addr,
uint64_t indirect_buf_stride,
uint32_t copy_idx,
uint32_t img_mip_level,
int layer_count,
int forced_layer_or_z)
{
enum isl_format original_format = img_blorp_surf->surf->format;
const struct isl_format_layout *fmtl =
isl_format_get_layout(original_format);
enum isl_format copy_format = get_format_for_copy(fmtl->bpb);
int dimensions = img_blorp_surf->surf->dim + 1;
struct blorp_indirect_copy_mem2img_key key = {
.dimensions = dimensions,
.forced_layer_or_z = forced_layer_or_z,
.format_Bpb = fmtl->bpb / 8,
.format_bw = fmtl->bw,
.format_bh = fmtl->bh,
.format_bd = fmtl->bd,
};
struct blorp_params params;
blorp_params_init(&params);
params.op = BLORP_OP_COPY_IMAGE_INDIRECT;
params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
params.wm_inputs.indirect.indirect_buf_stride = indirect_buf_stride;
params.wm_inputs.indirect.copy_idx = copy_idx;
params.wm_inputs.indirect.dimensions = dimensions;
params.wm_inputs.indirect.max_layer =
img_blorp_surf->surf->logical_level0_px.array_len - 1;
params.wm_inputs.indirect.forced_layer_or_z = forced_layer_or_z;
/* params.dst is our image. */
blorp_surface_info_init(batch, &params.dst, img_blorp_surf,
img_mip_level,
forced_layer_or_z == -1 ? 0 : forced_layer_or_z,
copy_format,
true /* is_dest */);
struct isl_extent3d mip_dimensions = {
.width = MAX2(params.dst.surf.logical_level0_px.w >> img_mip_level, 1),
.height = MAX2(params.dst.surf.logical_level0_px.h >> img_mip_level, 1),
.depth = MAX2(params.dst.surf.logical_level0_px.d >> img_mip_level, 1),
};
if (fmtl->bw > 1 || fmtl->bh > 1 || fmtl->bd > 1) {
blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev,
&params.dst, NULL, NULL, NULL, NULL);
params.wm_inputs.indirect.x_offset = params.dst.tile_x_sa;
params.wm_inputs.indirect.y_offset = params.dst.tile_y_sa;
mip_dimensions.width = params.dst.surf.logical_level0_px.w;
mip_dimensions.height = params.dst.surf.logical_level0_px.h;
mip_dimensions.depth = params.dst.surf.logical_level0_px.d;
}
/* These settings control the number of workgroups in the shader, see
 * blorp_exec_compute(). We don't need to divide by the local sizes here;
 * that will be done later.
 */
params.x0 = 0;
params.y0 = 0;
params.x1 = mip_dimensions.width;
params.y1 = mip_dimensions.height;
if (forced_layer_or_z == -1) {
/* We set this here so blorp_indirect_buf2img_get_dispatch_size() can
* read it while figuring out how many shader instances we'll need.
*/
params.num_layers = layer_count;
} else {
params.num_layers = 1;
}
if (!blorp_get_copy_mem2img_indirect_kernel_cs(batch, &params, &key)) {
mesa_loge("failed to get copy_memory_to_image_indirect CS kernel");
assert(false);
return;
}
batch->blorp->exec(batch, &params);
}


@ -205,17 +205,55 @@ struct blorp_wm_inputs_clear {
struct blorp_bounds_rect bounds_rect;
};
/* Parameters used in blorp_indirect_copy.c */
struct blorp_wm_inputs_indirect {
/* The address of the indirect buffer containing the information about the
* indirect copy.
*/
uint64_t indirect_buf_addr;
/* How far apart, in bytes, the information about each copy is inside the
 * indirect buffer.
 */
uint64_t indirect_buf_stride;
/* How many copies we have to do. */
uint32_t copy_count;
/* For memory to image copies, we do a single copy per dispatch. This
 * represents the index of the copy to be done.
 */
uint32_t copy_idx;
/* How many dimensions does our image have? 1, 2 or 3. */
uint32_t dimensions;
/* The maximum array layer of the image. */
uint32_t max_layer;
/* When compressed formats are used, we pretend they are a non-compressed
 * format of the same bpb. Since we can't maintain the exact same layout
 * of mipmap and layer offsets, we're forced to make adjustments to where X
 * and Y actually start, and are also forced to copy only one layer (or Z
 * axis position) per shader invocation.
 */
uint32_t x_offset;
uint32_t y_offset;
int forced_layer_or_z;
};
struct blorp_wm_inputs
{
union {
struct blorp_wm_inputs_blit blit;
struct blorp_wm_inputs_clear clear;
struct blorp_wm_inputs_indirect indirect;
};
/* Note: Pad out to an integral number of registers when extending, but
* make sure subgroup_id is the last 32-bit item.
*/
uint32_t pad[2];
uint32_t pad[1];
uint32_t subgroup_id;
};
@ -257,6 +295,7 @@ enum blorp_shader_type {
BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
BLORP_SHADER_TYPE_LAYER_OFFSET_VS,
BLORP_SHADER_TYPE_GFX4_SF,
BLORP_SHADER_TYPE_COPY_INDIRECT,
};
enum blorp_shader_pipeline {
@ -467,6 +506,25 @@ struct blorp_blit_prog_key
uint8_t local_y;
};
struct blorp_indirect_copy_mem2img_key {
/* How many dimensions does our image have? 1, 2 or 3. */
uint32_t dimensions;
/* When compressed formats are used, we pretend they are a non-compressed
 * format of the same bpb. Since we can't maintain the exact same layout
 * of mipmap and layer offsets, we're forced to make adjustments to where X
 * and Y actually start, and are also forced to copy only one layer (or Z
 * axis position) per shader invocation.
 */
int forced_layer_or_z;
/* Info taken from isl_format_layout. */
uint16_t format_Bpb;
uint16_t format_bw;
uint16_t format_bh;
uint16_t format_bd;
};
/**
* \name BLORP internals
* \{
@ -597,6 +655,19 @@ blorp_op_type_is_clear(enum blorp_op op)
}
}
/* This means: params->wm_inputs.indirect should be used. */
static inline bool
blorp_op_type_is_indirect(enum blorp_op op)
{
switch (op) {
case BLORP_OP_COPY_INDIRECT:
case BLORP_OP_COPY_IMAGE_INDIRECT:
return true;
default:
return false;
}
}
/* Asserts unless the surface is a buffer to image copy */
#define blorp_assert_is_buffer(surf, view) \
do { \


@ -3,6 +3,7 @@
*/
#include "compiler/libcl/libcl.h"
#include "compiler/libcl/libcl_vk.h"
#include "compiler/nir/nir_defines.h"
#include "compiler/shader_enums.h"
@ -15,3 +16,279 @@ blorp_check_in_bounds(uint4 bounds_rect, uint2 pos)
return pos.x >= x0 && pos.x < x1 &&
pos.y >= y0 && pos.y < y1;
}
void nir_image_store(uint handle, int4 coords, uint sample_index,
uint4 colors, uint lod, uint image_dim,
uint image_array, uint format, uint access,
uint range_base, uint src_type);
/* Used by vkCmdCopyMemoryIndirectKHR. */
void
blorp_copy_memory_indirect_shader(
global uint *indirect_buf,
uint copy_count,
ulong stride_bytes,
uint global_id)
{
for (uint c = 0; c < copy_count; c++) {
uint idx = (uint)(c * (stride_bytes / 4));
/* The spec says the minimum alignment is 4 instead of 8, so we index the
 * buffer in 4-byte units and cast, instead of indexing an array of the
 * struct type.
 */
VkCopyMemoryIndirectCommandKHR cmd =
(*(global VkCopyMemoryIndirectCommandKHR*)&indirect_buf[idx]);
uint copy_size_ints = cmd.size / 4;
global int *src = (global int*)cmd.srcAddress;
global int *dst = (global int*)cmd.dstAddress;
if (global_id < copy_size_ints)
dst[global_id] = src[global_id];
}
}
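In other words, the grid is sized (via the MI_MATH code in genX_blorp_exec) for the largest copy, and every invocation walks all copy records, writing at most one dword per record; invocations beyond a given copy's size simply fail the bounds check. A hypothetical CPU model of one invocation, assuming a tightly packed buffer (stride equal to sizeof(VkCopyMemoryIndirectCommandKHR)) and the struct definition from the stub header above being in scope:

#include <stdint.h>

/* Hypothetical host-side model of one shader invocation of the loop above. */
static void
model_copy_invocation(const VkCopyMemoryIndirectCommandKHR *cmds,
                      uint32_t copy_count, uint32_t global_id)
{
   for (uint32_t c = 0; c < copy_count; c++) {
      uint32_t copy_size_dwords = (uint32_t)(cmds[c].size / 4);
      if (global_id < copy_size_dwords) {
         const uint32_t *src = (const uint32_t *)(uintptr_t)cmds[c].srcAddress;
         uint32_t *dst = (uint32_t *)(uintptr_t)cmds[c].dstAddress;
         dst[global_id] = src[global_id];
      }
   }
}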
struct img_copy_params {
ulong src_address;
uint2 row_size_px;
uint base_layer;
uint layer_count;
uint3 offset;
uint3 extent;
};
void
read_img_params(
global uint *indirect_buf,
uint copy_idx,
ulong stride_bytes,
uint dimensions,
uint max_layer,
uint3 format_block_size,
bool is_block_compressed,
struct img_copy_params *p)
{
uint idx = copy_idx * (stride_bytes / 4);
VkCopyMemoryToImageIndirectCommandKHR cmd =
(*(global VkCopyMemoryToImageIndirectCommandKHR*)&indirect_buf[idx]);
p->src_address = cmd.srcAddress;
p->row_size_px = (uint2)(cmd.bufferRowLength, cmd.bufferImageHeight);
/* We don't use imageSubresource.aspectMask and imageSubresource.mipLevel,
* those are dealt with when the application calls
* vkCmdCopyMemoryToImageIndirectKHR().
*/
p->base_layer = cmd.imageSubresource.baseArrayLayer;
p->layer_count = cmd.imageSubresource.layerCount;
p->offset = (uint3)(cmd.imageOffset.x, cmd.imageOffset.y,
cmd.imageOffset.z);
p->extent = (uint3)(cmd.imageExtent.width, cmd.imageExtent.height,
cmd.imageExtent.depth);
if (p->row_size_px.x == 0)
p->row_size_px.x = p->extent.x;
if (p->row_size_px.y == 0)
p->row_size_px.y = p->extent.y;
/* Our code deals with blocks, not pixels. */
if (is_block_compressed) {
p->offset /= format_block_size;
p->extent = DIV_ROUND_UP(p->extent, format_block_size);
}
/* Users can pass 3D images with the Z axis as an array layer. */
if (dimensions == 3 && (p->base_layer != 0 || p->layer_count != 1)) {
p->offset.z = p->base_layer;
p->extent.z = p->layer_count;
p->base_layer = 0;
p->layer_count = 1;
}
/* This handles VK_REMAINING_ARRAY_LAYERS and bugs. */
if (p->base_layer + p->layer_count > max_layer + 1)
p->layer_count = max_layer - p->base_layer + 1;
}
uint4
get_pixel(
ulong src_address,
uint format_Bpb)
{
switch (format_Bpb) {
case 1: {
global uchar *src_buf = (global uchar *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 2: {
global ushort *src_buf = (global ushort *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 4: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 8: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(vload2(0, src_buf), 0, 0);
}
case 16: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(vload4(0, src_buf));
}
default:
/* TODO: support 3, 6, 12. */
return (uint4)(0xFF, 0, 0, 0xFF);
}
}
int4
get_coords(
uint3 pos,
uint layer,
uint2 dest_coord_offsets,
int dimensions,
int forced_layer_or_z)
{
int4 ret;
pos.xy += dest_coord_offsets;
switch (dimensions) {
case 1:
ret = (int4)(pos.x, layer, 0, 0);
break;
case 2:
ret = (int4)(pos.x, pos.y, layer, 0);
break;
case 3:
default:
ret = (int4)(pos.x, pos.y, pos.z, 0);
break;
}
if (forced_layer_or_z != -1) {
ret.z = 0;
ret.w = 0;
}
return ret;
}
void
write_pixel(
const int4 coords,
const uint4 colors)
{
/* We don't seem to need to set image_array to true if we set sampler_dim
 * to 3D.
 * Setting mip_level does not do anything on Intel; we set mip levels
 * through the bindings.
 */
nir_image_store(0 /* The image handle. */,
coords, /* See get_coords(). */
0, /* Sample index for multi-sampling. */
colors, /* The RGBA pixels, in src_type. */
0 /* mip_level */,
GLSL_SAMPLER_DIM_3D,
false, /* image_array */
0, /* format */
ACCESS_NON_READABLE, /* access */
0, /* range_base */
nir_type_int32 /* src_type */);
}
/* Used by vkCmdCopyMemoryToImageIndirectKHR. */
void
blorp_copy_memory_to_image_indirect_shader(
/* Actual parameters. */
global uint *indirect_buf,
ulong stride_bytes,
uint copy_idx,
uint max_layer,
uint2 dest_coord_offsets,
uint3 global_id,
/* These are shader keys, they are NIR immediates. */
uint dimensions,
int forced_layer_or_z,
ushort format_Bpb,
uint3 format_block_size,
bool is_block_compressed)
{
/* We have one invocation per texel of a given mip level. This means that
 * for the pixels outside the copy area, we'll hit one of the early
 * returns below.
 */
uint3 src_pos = global_id;
uint layer;
/* 'forced_layer_or_z' means that whatever slice or layer we're
* trying to work with is set as part of our binding (view) as depth
* 0 or layer 0, depending on the dimensionality. This can happen
* when we're trying to pretend a format is something that it's not
* (e.g., we're treating a block compressed 4x4 64bpp format as an
* r32g32 format).
*/
if (forced_layer_or_z != -1) {
if (dimensions == 3) {
src_pos.z = forced_layer_or_z;
layer = 0;
} else {
src_pos.z = 0;
layer = forced_layer_or_z;
}
} else {
if (dimensions == 3) {
src_pos.z = global_id.z;
layer = 0;
} else {
src_pos.z = 0;
layer = global_id.z;
}
}
struct img_copy_params p;
read_img_params(indirect_buf, copy_idx, stride_bytes, dimensions,
max_layer, format_block_size, is_block_compressed, &p);
if (any(src_pos >= p.extent))
return;
uint3 dst_pos = src_pos + p.offset;
if (forced_layer_or_z == -1 && dimensions != 3)
layer = global_id.z + p.base_layer;
if (layer > p.base_layer + p.layer_count - 1)
return;
uint2 row_size_blocks = p.row_size_px;
if (is_block_compressed) {
row_size_blocks = DIV_ROUND_UP(row_size_blocks,
format_block_size.xy);
}
uint row_length_bytes = row_size_blocks.x * format_Bpb;
uint row_height_bytes = row_size_blocks.y * row_length_bytes;
uint buf_z_offset_bytes = src_pos.z * row_height_bytes;
uint buf_y_offset_bytes = src_pos.y * row_length_bytes;
uint buf_x_offset_bytes = src_pos.x * format_Bpb;
uint layer_offset_bytes = (layer - p.base_layer) * row_height_bytes;
uint buf_offset_bytes = layer_offset_bytes +
buf_z_offset_bytes +
buf_y_offset_bytes +
buf_x_offset_bytes;
uint4 colors = get_pixel(p.src_address + buf_offset_bytes,
format_Bpb);
int4 coords = get_coords(dst_pos, layer, dest_coord_offsets, dimensions,
forced_layer_or_z);
write_pixel(coords, colors);
}
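The source addressing above is the standard Vulkan buffer-to-image addressing, expressed in blocks: bufferRowLength/bufferImageHeight (defaulting to the copy extent when zero) give the row and slice pitches, and array layers reuse the slice pitch. A host-side model of the byte-offset computation, for illustration only, with a hypothetical helper name and all quantities assumed to already be in block units:

#include <stdint.h>

/* Hypothetical CPU model of the buf_offset_bytes computation above. */
static uint64_t
model_src_offset_bytes(uint32_t x, uint32_t y, uint32_t z, uint32_t layer_idx,
                       uint32_t row_blocks, uint32_t rows_per_slice,
                       uint32_t bytes_per_block)
{
   const uint64_t row_pitch   = (uint64_t)row_blocks * bytes_per_block;
   const uint64_t slice_pitch = (uint64_t)rows_per_slice * row_pitch;

   return layer_idx * slice_pitch +   /* (layer - base_layer) */
          z * slice_pitch +
          y * row_pitch +
          x * bytes_per_block;
}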


@ -37,6 +37,7 @@ files_libblorp = files(
'blorp.h',
'blorp_blit.c',
'blorp_clear.c',
'blorp_indirect_copy.c',
'blorp_nir_builder.h',
'blorp_priv.h',
)


@ -1066,6 +1066,17 @@ mi_ior(struct mi_builder *b,
MI_ALU_STORE, MI_ALU_ACCU);
}
static inline struct mi_value
mi_ixor(struct mi_builder *b,
struct mi_value src0, struct mi_value src1)
{
if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
return mi_imm(mi_value_to_u64(src0) ^ mi_value_to_u64(src1));
return mi_math_binop(b, MI_ALU_XOR, src0, src1,
MI_ALU_STORE, MI_ALU_ACCU);
}
#if GFX_VERx10 >= 125
static inline struct mi_value
mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
@ -1290,6 +1301,40 @@ mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
}
}
/* Returns the maximum of the two specified unsigned numbers. */
static inline struct mi_value
mi_umax2(struct mi_builder *b, struct mi_value val1, struct mi_value val2)
{
/* The idea of the algorithm here is that the value of 'mask' will be
 * either 0 or ~0 depending on which number is bigger. Then we use AND
 * operations to ensure the smaller value becomes zero and the bigger value
 * is preserved, and finally OR the two results into the destination (the
 * bigger value and zero).
 *
 * In other words:
 *   mask = val1 < val2 ? 0xFFFFFFFF : 0x0;
 *   biggest = (val1 & ~mask) | (val2 & mask);
 */
/* If 'val1' is smaller, 'mask' is ~0, otherwise it's 0. */
struct mi_value mask = mi_ult(b, mi_value_ref(b, val1),
mi_value_ref(b, val2));
struct mi_value notmask = mi_ixor(b, mi_value_ref(b, mask),
mi_imm(UINT64_MAX));
/* If 'val1' is smaller, 'notmask' is 0, so we zero it, otherwise we
* preserve the value by ANDing it with ~0.
*/
struct mi_value val1_or_zero = mi_iand(b, val1, notmask);
/* If 'val2' is smaller, mask is 0, so we zero it, otherwise we preserve
* the value.
*/
struct mi_value val2_or_zero = mi_iand(b, val2, mask);
/* The smaller value was zeroed, the other was preserved, so just OR
* them now.
*/
return mi_ior(b, val1_or_zero, val2_or_zero);
}
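The same trick is easy to sanity-check on the CPU; a plain C rendering of it (for illustration only, not MI builder code):

#include <stdint.h>

/* Branchless unsigned max: 'mask' is all-ones when a < b (what mi_ult()
 * produces here), so exactly one of the two AND terms survives the OR.
 */
static uint64_t
umax_branchless(uint64_t a, uint64_t b)
{
   uint64_t mask = (a < b) ? UINT64_MAX : 0;
   return (a & ~mask) | (b & mask);
}

This is the behavior the umax2_32/umax2_64 tests below exercise against a fixed comparison value.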
#endif /* MI_MATH section */
/* This assumes addresses of strictly more than 32bits (aka. Gfx8+). */


@ -1252,6 +1252,58 @@ TEST_F(mi_builder_test, udiv32_imm)
}
}
TEST_F(mi_builder_test, umax2_32)
{
const uint32_t values[] = {
0x01234567,
0x42424242,
0xffffffff,
0xf0000000,
0x00000001,
0x00000000,
0x000f0000,
0x000f0001,
};
memcpy(input, values, sizeof(values));
uint32_t cmp_val = 0x000f0001;
for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
mi_store(&b, out_mem64(i * 4), mi_umax2(&b, in_mem32(i * 4), mi_imm(cmp_val)));
submit_batch();
for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
EXPECT_EQ(*(uint32_t *)(output + i * 4),
values[i] >= cmp_val ? values[i] : cmp_val);
}
}
TEST_F(mi_builder_test, umax2_64)
{
const uint64_t values[] = {
0x0123456789abcdef,
0x4242424242424242,
0xffffffffffffffff,
0xf000000000000000,
0x0000000000000001,
0x0000000000000000,
0x00000000f0000000,
0x00000000f0000001,
};
memcpy(input, values, sizeof(values));
uint64_t cmp_val = 0x00000000f0000001;
for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
mi_store(&b, out_mem64(i * 8), mi_umax2(&b, in_mem64(i * 8), mi_imm(cmp_val)));
submit_batch();
for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
EXPECT_EQ(*(uint64_t *)(output + i * 8),
values[i] >= cmp_val ? values[i] : cmp_val);
}
}
TEST_F(mi_builder_test, store_if)
{
uint64_t u64 = 0xb453b411deadc0deull;


@ -2811,3 +2811,135 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
anv_blorp_batch_finish(&batch);
}
void
anv_CmdCopyMemoryIndirectKHR(
VkCommandBuffer commandBuffer,
const VkCopyMemoryIndirectInfoKHR* pCopyMemoryIndirectInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
if (pCopyMemoryIndirectInfo->copyCount == 0)
return;
const uint64_t indirect_buf_addr =
pCopyMemoryIndirectInfo->copyAddressRange.address;
const uint32_t copy_count = pCopyMemoryIndirectInfo->copyCount;
const uint64_t stride = pCopyMemoryIndirectInfo->copyAddressRange.stride;
/* These are all restrictions imposed by the spec. */
assert((pCopyMemoryIndirectInfo->srcCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert((pCopyMemoryIndirectInfo->dstCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert(pCopyMemoryIndirectInfo->copyAddressRange.size >=
pCopyMemoryIndirectInfo->copyCount *
pCopyMemoryIndirectInfo->copyAddressRange.stride);
assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
blorp_copy_memory_indirect(&batch, indirect_buf_addr, copy_count, stride);
anv_blorp_batch_finish(&batch);
}
void
anv_CmdCopyMemoryToImageIndirectKHR(
VkCommandBuffer commandBuffer,
const VkCopyMemoryToImageIndirectInfoKHR* pCopyMemoryToImageIndirectInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, anv_image,
pCopyMemoryToImageIndirectInfo->dstImage);
if (pCopyMemoryToImageIndirectInfo->copyCount == 0)
return;
const uint64_t indirect_buf_addr =
pCopyMemoryToImageIndirectInfo->copyAddressRange.address;
const uint32_t copy_count = pCopyMemoryToImageIndirectInfo->copyCount;
const uint64_t stride =
pCopyMemoryToImageIndirectInfo->copyAddressRange.stride;
const VkImageLayout img_layout =
pCopyMemoryToImageIndirectInfo->dstImageLayout;
assert((pCopyMemoryToImageIndirectInfo->srcCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert(pCopyMemoryToImageIndirectInfo->copyAddressRange.size >=
pCopyMemoryToImageIndirectInfo->copyCount *
pCopyMemoryToImageIndirectInfo->copyAddressRange.stride);
assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
for (int c = 0; c < copy_count; c++) {
const VkImageSubresourceLayers *img_subresource =
&pCopyMemoryToImageIndirectInfo->pImageSubresources[c];
VkImageAspectFlags aspect_mask = img_subresource->aspectMask;
uint32_t mip_level = img_subresource->mipLevel;
uint32_t base_layer = img_subresource->baseArrayLayer;
uint32_t layer_count = img_subresource->layerCount;
assert(mip_level != VK_REMAINING_MIP_LEVELS);
if (layer_count == VK_REMAINING_ARRAY_LAYERS)
layer_count = anv_image->vk.array_layers - base_layer;
const unsigned plane =
anv_image_aspect_to_plane(anv_image, aspect_mask);
struct isl_surf *img_isl_surf =
&anv_image->planes[plane].primary_surface.isl;
enum isl_format format = img_isl_surf->format;
bool format_is_compressed = isl_format_is_compressed(format);
struct blorp_surf img_blorp_surf;
get_blorp_surf_for_anv_image(cmd_buffer, anv_image,
aspect_mask, VK_IMAGE_USAGE_STORAGE_BIT,
img_layout,
anv_image->planes[plane].aux_usage,
format, false /* cross_aspect */,
&img_blorp_surf);
anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image,
img_subresource->aspectMask,
img_blorp_surf.aux_usage, mip_level,
base_layer, layer_count);
/* If the format is compressed, we will pretend the format is one where
 * each pixel is the size of the compressed block, so image stores can
 * work. Unfortunately, that translation only works if we do it for one
 * mip level of a specific layer, since the complete layouts of, say,
 * BC4 and R32G32 formats are not compatible, so we loop through all
 * layers here.
 */
if (format_is_compressed) {
int min_l_or_z, top_l_or_z;
if (img_blorp_surf.surf->dim == ISL_SURF_DIM_3D) {
min_l_or_z = 0;
top_l_or_z = img_isl_surf->logical_level0_px.depth >> mip_level;
} else {
min_l_or_z = base_layer;
top_l_or_z = base_layer + layer_count;
}
for (int l_or_z = min_l_or_z; l_or_z < top_l_or_z; l_or_z++) {
blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
indirect_buf_addr,
stride, c, mip_level,
layer_count, l_or_z);
}
} else {
blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
indirect_buf_addr, stride, c,
mip_level, layer_count, -1);
}
}
anv_blorp_batch_finish(&batch);
}


@ -629,6 +629,39 @@ anv_get_format_aspect(const struct anv_physical_device *device,
return anv_get_format_plane(device, vk_format, plane, tiling);
}
static bool
anv_format_supports_indirect_copies(const struct anv_physical_device *pdevice,
const struct anv_format *anv_format)
{
const struct isl_format_layout *fmtl =
isl_format_get_layout(anv_format->planes[0].isl_format);
/* CTS insists on it even when we say we don't support it. */
if (!pdevice->vk.supported_features.indirectMemoryToImageCopy)
return false;
/* TODO: implement support for this in the copy shader. */
if (!util_is_power_of_two_or_zero(fmtl->bpb))
return false;
/* TODO: we use compute for indirect copies, and compute cannot write HiZ.
 * We could try to support that if we see that applications want it.
 */
if (vk_format_is_depth_or_stencil(anv_format->vk_format))
return false;
/* Let's leave YCbCr and multi-planar formats out until we have proper
* tests to verify they work.
*/
if (isl_format_is_yuv(anv_format->planes[0].isl_format))
return false;
if (anv_format->n_planes > 1)
return false;
return true;
}
// Format capabilities
static bool
@ -715,28 +748,41 @@ anv_color_format_supports_drm_modifier_tiling(const struct anv_physical_device *
}
static VkFormatFeatureFlags2
anv_get_compressed_emulated_format_features(const struct anv_format *anv_format,
anv_get_compressed_emulated_format_features(const struct anv_physical_device *pdevice,
const struct anv_format *anv_format,
const VkImageTiling vk_tiling)
{
VkFormatFeatureFlags2 flags = 0;
assert(isl_format_is_compressed(anv_format->planes[0].isl_format));
/* Require optimal tiling so that we can decompress on upload */
if (vk_tiling == VK_IMAGE_TILING_OPTIMAL) {
switch (vk_tiling) {
case VK_IMAGE_TILING_OPTIMAL:
/* Required features for compressed formats */
return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
} else if (vk_tiling == VK_IMAGE_TILING_LINEAR) {
flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
break;
case VK_IMAGE_TILING_LINEAR:
/* Images used for transfers */
return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
break;
default:
return 0;
}
return 0;
if (anv_format_supports_indirect_copies(pdevice, anv_format))
flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
return flags;
}
static VkFormatFeatureFlags2
@ -852,7 +898,8 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
if (vk_tiling == VK_IMAGE_TILING_LINEAR &&
isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
@ -935,6 +982,10 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
}
if ((flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT) &&
anv_format_supports_indirect_copies(physical_device, anv_format))
flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
/* XXX: We handle 3-channel formats by switching them out for RGBX or
* RGBA formats behind-the-scenes. This works fine for textures
* because the upload process will fill in the extra channel.
@ -1125,7 +1176,8 @@ anv_get_image_format_features2(const struct anv_physical_device *physical_device
}
if (anv_is_compressed_format_emulated(physical_device, vk_format)) {
return anv_get_compressed_emulated_format_features(anv_format,
return anv_get_compressed_emulated_format_features(physical_device,
anv_format,
vk_tiling);
}


@ -152,6 +152,7 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_cooperative_matrix = device->has_cooperative_matrix,
.NV_cooperative_matrix2 = device->has_cooperative_matrix,
.KHR_copy_commands2 = true,
.KHR_copy_memory_indirect = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_deferred_host_operations = true,
@ -1023,6 +1024,10 @@ get_features(const struct anv_physical_device *pdevice,
/* VK_KHR_pipeline_binary */
.pipelineBinaries = true,
/* VK_KHR_copy_memory_indirect */
.indirectMemoryCopy = true,
.indirectMemoryToImageCopy = pdevice->info.ver >= 12,
#ifdef ANV_USE_WSI_PLATFORM
/* VK_EXT_present_timing */
.presentTiming = true,
@ -2205,6 +2210,11 @@ get_properties(const struct anv_physical_device *pdevice,
*/
props->shaderBinaryVersion = 0;
}
/* VK_KHR_copy_memory_indirect */
{
props->supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
}
}
/* This function restricts the maximum size of system memory heap. The


@ -534,6 +534,8 @@ get_color_aux_op(const struct blorp_params *params)
case BLORP_OP_SLOW_COLOR_CLEAR:
case BLORP_OP_BLIT:
case BLORP_OP_COPY:
case BLORP_OP_COPY_INDIRECT:
case BLORP_OP_COPY_IMAGE_INDIRECT:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ANV_COLOR_AUX_OP_CLASS_NONE;
}
@ -596,3 +598,10 @@ genX(blorp_init_dynamic_states)(struct blorp_context *context)
{
blorp_init_dynamic_states(context);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
{
struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
return &cmd_buffer->batch.write_fence_status;
}