ac/nir: import the dispatch logic for the universal compute clear/blit shader

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
2026-03-09 05:10:32 +01:00 · 2024-04-24 04:09:33 -04:00 · 2024-04-24 04:09:33 -04:00 · dc113c418d
commit dc113c418d
parent 6b15e45908
3 changed files with 892 additions and 716 deletions
--- a/src/amd/common/ac_nir_meta.h
+++ b/src/amd/common/ac_nir_meta.h
@ -9,6 +9,7 @@

 #include "ac_gpu_info.h"
 #include "nir.h"
+#include "util/box.h"

 union ac_ps_resolve_key {
   struct {
@ -82,14 +83,56 @@ union ac_cs_blit_key {
 };

 struct ac_cs_blit_options {
+   /* Global options. */
   const nir_shader_compiler_options *nir_options;
   const struct radeon_info *info;
-   bool use_aco;     /* global driver setting */
-   bool no_fmask;    /* FMASK disabled by a debug option, ignored on GFX11+ */
-   bool print_key;   /* print ac_ps_resolve_key into stderr */
+   bool use_aco;        /* global driver setting */
+   bool no_fmask;       /* FMASK disabled by a global debug option, ignored on GFX11+ */
+   bool print_key;      /* print ac_ps_resolve_key into stderr */
+   bool fail_if_slow;   /* fail if a gfx blit is faster, set to false on compute queues */
+
+   bool is_nested;      /* for internal use, don't set */
+};
+
+struct ac_cs_blit_description
+{
+   struct {
+      struct radeon_surf *surf;
+      uint8_t dim;            /* 1 = 1D texture, 2 = 2D texture, 3 = 3D texture */
+      bool is_array;          /* array or cube texture */
+      unsigned width0;        /* level 0 width */
+      unsigned height0;       /* level 0 height */
+      uint8_t num_samples;
+      uint8_t level;
+      struct pipe_box box;       /* negative width, height only legal for src */
+      enum pipe_format format;   /* format reinterpretation */
+   } dst, src;
+
+   bool sample0_only;                  /* copy sample 0 instead of resolving */
+   union pipe_color_union clear_color; /* if src.surf == NULL, this is the clear color */
+};
+
+/* Dispatch parameters generated by the blit. */
+struct ac_cs_blit_dispatch {
+   union ac_cs_blit_key shader_key;
+   uint32_t user_data[8];        /* for nir_intrinsic_load_user_data_amd */
+
+   unsigned wg_size[3];          /* variable workgroup size (NUM_THREAD_FULL) */
+   unsigned last_wg_size[3];     /* workgroup size of the last workgroup (NUM_THREAD_PARTIAL) */
+   unsigned num_workgroups[3];   /* DISPATCH_DIRECT parameters */
+};
+
+struct ac_cs_blit_dispatches {
+   unsigned num_dispatches;
+   struct ac_cs_blit_dispatch dispatches[7];
 };

 nir_shader *
 ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_blit_key *key);

+bool
+ac_prepare_compute_blit(const struct ac_cs_blit_options *options,
+                        const struct ac_cs_blit_description *blit,
+                        struct ac_cs_blit_dispatches *dispatches);
+
 #endif
--- a/src/amd/common/ac_nir_meta_cs_blit.c
+++ b/src/amd/common/ac_nir_meta_cs_blit.c
@ -6,8 +6,11 @@

 #include "ac_nir_meta.h"
 #include "ac_nir_helpers.h"
+#include "ac_surface.h"
 #include "nir_format_convert.h"
 #include "compiler/aco_interface.h"
+#include "util/format_srgb.h"
+#include "util/u_pack_color.h"

 static nir_def *
 deref_ssa(nir_builder *b, nir_variable *var)
@ -437,3 +440,741 @@ ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_bl

   return b.shader;
 }
+
+static unsigned
+set_work_size(struct ac_cs_blit_dispatch *dispatch,
+              unsigned block_x, unsigned block_y, unsigned block_z,
+              unsigned num_wg_x, unsigned num_wg_y, unsigned num_wg_z)
+{
+   dispatch->wg_size[0] = block_x;
+   dispatch->wg_size[1] = block_y;
+   dispatch->wg_size[2] = block_z;
+
+   unsigned num_wg[3] = {num_wg_x, num_wg_y, num_wg_z};
+   for (int i = 0; i < 3; ++i) {
+      dispatch->last_wg_size[i] = num_wg[i] % dispatch->wg_size[i];
+      dispatch->num_workgroups[i] = DIV_ROUND_UP(num_wg[i], dispatch->wg_size[i]);
+   }
+
+   return num_wg_z > 1 ? 3 : (num_wg_y > 1 ? 2 : 1);
+}
+
+static bool
+should_blit_clamp_to_edge(const struct ac_cs_blit_description *blit, unsigned coord_mask)
+{
+   return util_is_box_out_of_bounds(&blit->src.box, coord_mask, blit->src.width0,
+                                    blit->src.height0, blit->src.level);
+}
+
+/* Return a power-of-two alignment of a number. */
+static unsigned
+compute_alignment(unsigned x)
+{
+   return x ? BITFIELD_BIT(ffs(x) - 1) : BITFIELD_BIT(31);
+}
+
+/* Set the blit info, but change the dst box and trim the src box according to the new dst box. */
+static void
+set_trimmed_blit(const struct ac_cs_blit_description *old, const struct pipe_box *box,
+                 bool is_clear, struct ac_cs_blit_description *out)
+{
+   assert(old->dst.box.x <= box->x);
+   assert(old->dst.box.y <= box->y);
+   assert(old->dst.box.z <= box->z);
+   assert(box->x + box->width <= old->dst.box.x + old->dst.box.width);
+   assert(box->y + box->height <= old->dst.box.y + old->dst.box.height);
+   assert(box->z + box->depth <= old->dst.box.z + old->dst.box.depth);
+   /* No scaling. */
+   assert(is_clear || old->dst.box.width == abs(old->src.box.width));
+   assert(is_clear || old->dst.box.height == abs(old->src.box.height));
+   assert(is_clear || old->dst.box.depth == abs(old->src.box.depth));
+
+   *out = *old;
+   out->dst.box = *box;
+
+   if (!is_clear) {
+      if (out->src.box.width > 0) {
+         out->src.box.x += box->x - old->dst.box.x;
+         out->src.box.width = box->width;
+      } else {
+         out->src.box.x -= box->x - old->dst.box.x;
+         out->src.box.width = -box->width;
+      }
+
+      if (out->src.box.height > 0) {
+         out->src.box.y += box->y - old->dst.box.y;
+         out->src.box.height = box->height;
+      } else {
+         out->src.box.y -= box->y - old->dst.box.y;
+         out->src.box.height = -box->height;
+      }
+
+      out->src.box.z += box->z - old->dst.box.z;
+      out->src.box.depth = box->depth;
+   }
+}
+
+typedef struct {
+   unsigned x, y, z;
+} uvec3;
+
+/* This function uses the blit description to generate the shader key, prepare user SGPR constants,
+ * and determine the parameters for up to 7 compute dispatches.
+ *
+ * The driver should use the shader key to create the shader, set the SGPR constants, and launch
+ * compute dispatches.
+ */
+bool
+ac_prepare_compute_blit(const struct ac_cs_blit_options *options,
+                        const struct ac_cs_blit_description *blit,
+                        struct ac_cs_blit_dispatches *out)
+{
+   const struct radeon_info *info = options->info;
+   bool is_2d_tiling = !blit->dst.surf->is_linear && !blit->dst.surf->thick_tiling;
+   bool is_3d_tiling = blit->dst.surf->thick_tiling;
+   bool is_clear = !blit->src.surf;
+   unsigned dst_samples = MAX2(1, blit->dst.num_samples);
+   unsigned src_samples = is_clear ? 1 : MAX2(1, blit->src.num_samples);
+   bool is_resolve = !is_clear && dst_samples == 1 && src_samples >= 2 &&
+                     !util_format_is_pure_integer(blit->dst.format);
+   bool is_upsampling = !is_clear && src_samples == 1 && dst_samples >= 2;
+   bool sample0_only = src_samples >= 2 && dst_samples == 1 &&
+                       (blit->sample0_only || util_format_is_pure_integer(blit->dst.format));
+   /* Get the channel sizes. */
+   unsigned max_dst_chan_size = util_format_get_max_channel_size(blit->dst.format);
+   unsigned max_src_chan_size = is_clear ? 0 : util_format_get_max_channel_size(blit->src.format);
+
+   if (!options->is_nested)
+      memset(out, 0, sizeof(*out));
+
+   /* Reject blits with invalid parameters. */
+   if (blit->dst.box.width < 0 || blit->dst.box.height < 0 || blit->dst.box.depth < 0 ||
+       blit->src.box.depth < 0) {
+      assert(!"invalid box parameters"); /* this is reachable and prevents hangs */
+      return true;
+   }
+
+   /* Skip zero-area blits. */
+   if (!blit->dst.box.width || !blit->dst.box.height || !blit->dst.box.depth ||
+       (!is_clear && (!blit->src.box.width || !blit->src.box.height || !blit->src.box.depth)))
+      return true;
+
+   if (blit->dst.format == PIPE_FORMAT_A8R8_UNORM || /* This format fails AMD_TEST=imagecopy. */
+       max_dst_chan_size == 5 || /* PIPE_FORMAT_R5G5B5A1_UNORM has precision issues */
+       util_format_is_depth_or_stencil(blit->dst.format) ||
+       dst_samples > SI_MAX_COMPUTE_BLIT_SAMPLES ||
+       (!is_clear &&
+        /* Scaling is not implemented by the compute shader. */
+        (blit->dst.box.width != abs(blit->src.box.width) ||
+         blit->dst.box.height != abs(blit->src.box.height) ||
+         blit->dst.box.depth != abs(blit->src.box.depth) ||
+         util_format_is_depth_or_stencil(blit->src.format) ||
+         src_samples > SI_MAX_COMPUTE_BLIT_SAMPLES)))
+      return false;
+
+   /* Return a failure if a compute blit is slower than a gfx blit. */
+   if (options->fail_if_slow) {
+      if (is_clear) {
+         /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21, Navi31 */
+         if (is_3d_tiling) {
+            if (info->gfx_level == GFX6 && blit->dst.surf->bpe == 8)
+               return false;
+         } else if (is_2d_tiling) {
+            if (!(info->gfx_level == GFX6 && blit->dst.surf->bpe <= 4 && dst_samples == 1) &&
+                !(info->gfx_level == GFX7 && blit->dst.surf->bpe == 1 && dst_samples == 1))
+               return false;
+         }
+      } else {
+         /* For upsampling, image stores don't compress MSAA as good as draws. */
+         if (is_upsampling)
+            return false;
+
+         switch (info->gfx_level) {
+         case GFX6:
+         case GFX7:
+         case GFX8:
+         case GFX9:
+         case GFX10:
+         case GFX10_3:
+            /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21 */
+            if (is_resolve) {
+               if (!(info->gfx_level == GFX7 && blit->dst.surf->bpe == 16))
+                  return false;
+            } else {
+               assert(dst_samples == src_samples || sample0_only);
+
+               if (is_2d_tiling) {
+                  if (dst_samples == 1) {
+                     if (blit->dst.surf->bpe <= 8 &&
+                         !(info->gfx_level <= GFX7 && blit->dst.surf->bpe == 1) &&
+                         !(info->gfx_level == GFX6 && blit->dst.surf->bpe == 2 &&
+                           blit->src.surf->is_linear) &&
+                         !(info->gfx_level == GFX7 && blit->dst.surf->bpe >= 2 &&
+                           blit->src.surf->is_linear) &&
+                         !((info->gfx_level == GFX8 || info->gfx_level == GFX9) &&
+                           blit->dst.surf->bpe >= 2 && blit->src.surf->is_linear) &&
+                         !(info->gfx_level == GFX10 && blit->dst.surf->bpe <= 2 &&
+                           blit->src.surf->is_linear) &&
+                         !(info->gfx_level == GFX10_3 && blit->dst.surf->bpe == 8 &&
+                           blit->src.surf->is_linear))
+                        return false;
+
+                     if (info->gfx_level == GFX6 && blit->dst.surf->bpe == 16 &&
+                         blit->src.surf->is_linear && blit->dst.dim != 3)
+                        return false;
+
+                     if (blit->dst.surf->bpe == 16 && !blit->src.surf->is_linear &&
+                         /* Only GFX6 selects 2D tiling for 128bpp 3D textures. */
+                         !(info->gfx_level == GFX6 && blit->dst.dim == 3) &&
+                         info->gfx_level != GFX7)
+                        return false;
+                  } else {
+                     /* MSAA copies - tested only without FMASK on Navi21. */
+                     if (blit->dst.surf->bpe >= 4)
+                        return false;
+                  }
+               }
+            }
+            break;
+
+         case GFX11:
+         case GFX11_5:
+         default:
+            /* Verified on Navi31. */
+            if (is_resolve) {
+               if (!((blit->dst.surf->bpe <= 2 && src_samples == 2) ||
+                     (blit->dst.surf->bpe == 16 && src_samples == 4)))
+                  return false;
+            } else {
+               assert(dst_samples == src_samples || sample0_only);
+
+               if (is_2d_tiling) {
+                  if (blit->dst.surf->bpe == 2 && blit->src.surf->is_linear && dst_samples == 1)
+                     return false;
+
+                  if ((blit->dst.surf->bpe == 4 || blit->dst.surf->bpe == 8) && dst_samples == 1)
+                     return false;
+
+                  if (blit->dst.surf->bpe == 16 && dst_samples == 1 && !blit->src.surf->is_linear)
+                     return false;
+
+                  if (blit->dst.surf->bpe == 16 && dst_samples == 8)
+                     return false;
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   unsigned width = blit->dst.box.width;
+   unsigned height = blit->dst.box.height;
+   unsigned depth = blit->dst.box.depth;
+   uvec3 lane_size = (uvec3){1, 1, 1};
+
+   /* Determine the size of the block of pixels that will be processed by a single lane.
+    * Generally we want to load and store about 8-16B per lane, but there are exceptions.
+    * The block sizes were fine-tuned for Navi31, and might be suboptimal on different generations.
+    */
+   if (blit->dst.surf->bpe <= 8 && (is_resolve ? src_samples : dst_samples) <= 4 &&
+       /* Small blits don't benefit. */
+       width * height * depth * blit->dst.surf->bpe * dst_samples > 128 * 1024) {
+      if (is_3d_tiling) {
+         /* Thick tiling. */
+         if (!is_clear && blit->src.surf->is_linear) {
+            /* Linear -> Thick. */
+            if (blit->dst.surf->bpe == 4)
+               lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
+            else if (blit->dst.surf->bpe == 2)
+               lane_size = (uvec3){2, 1, 2}; /* 8B per lane */
+            else if (blit->dst.surf->bpe == 1)
+               lane_size = (uvec3){4, 1, 2}; /* 8B per lane */
+         } else {
+            if (blit->dst.surf->bpe == 8)
+               lane_size = (uvec3){1, 1, 2}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 4)
+               lane_size = (uvec3){1, 2, 2}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 2)
+               lane_size = (uvec3){1, 2, 4}; /* 16B per lane */
+            else
+               lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
+         }
+      } else if (blit->dst.surf->is_linear) {
+         /* Linear layout. */
+         if (!is_clear && !blit->src.surf->is_linear) {
+            /* Tiled -> Linear. */
+            if (blit->dst.surf->bpe == 8 && !blit->src.surf->thick_tiling)
+               lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 4)
+               lane_size = (uvec3){1, 2, 1}; /* 8B per lane */
+            else if (blit->dst.surf->bpe == 2 && blit->src.surf->thick_tiling)
+               lane_size = (uvec3){2, 2, 1}; /* 8B per lane */
+            else if (blit->dst.surf->bpe == 1 && blit->src.surf->thick_tiling)
+               lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
+            else if (blit->dst.surf->bpe <= 2)
+               lane_size = (uvec3){2, 4, 1}; /* 8-16B per lane */
+         } else {
+            /* Clear or Linear -> Linear. */
+            if (blit->dst.surf->bpe == 8)
+               lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 4)
+               lane_size = (uvec3){4, 1, 1}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 2)
+               lane_size = (uvec3){4, 2, 1}; /* 16B per lane */
+            else
+               lane_size = (uvec3){8, 1, 1}; /* 8B per lane */
+         }
+      } else {
+         /* Thin tiling. */
+         if (is_resolve) {
+            if (blit->dst.surf->bpe == 8 && src_samples == 2) {
+               lane_size = (uvec3){1, 2, 1}; /* 32B->16B per lane */
+            } else if (blit->dst.surf->bpe == 4) {
+               lane_size = (uvec3){2, 1, 1}; /* 32B->8B for 4 samples, 16B->8B for 2 samples */
+            } else if (blit->dst.surf->bpe <= 2) {
+               if (src_samples == 4)
+                  lane_size = (uvec3){2, 1, 1}; /* 16B->4B for 16bpp, 8B->2B for 8bpp */
+               else
+                  lane_size = (uvec3){2, 2, 1}; /* 16B->8B for 16bpp, 8B->4B for 8bpp */
+            }
+         } else {
+            if (blit->dst.surf->bpe == 8 && dst_samples == 1)
+               lane_size = (uvec3){1, 2, 1}; /* 16B per lane */
+            else if (blit->dst.surf->bpe == 4) {
+               if (dst_samples == 2)
+                  lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
+               else if (dst_samples == 1)
+                  lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
+            } else if (blit->dst.surf->bpe == 2) {
+               if (dst_samples == 4 || (!is_clear && blit->src.surf->is_linear))
+                  lane_size = (uvec3){2, 1, 1}; /* 16B per lane (4B for linear src) */
+               else if (dst_samples == 2)
+                  lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
+               else
+                  lane_size = (uvec3){2, 4, 1}; /* 16B per lane */
+            } else if (blit->dst.surf->bpe == 1) {
+               if (dst_samples == 4)
+                  lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
+               else if (dst_samples == 2 || (!is_clear && blit->src.surf->is_linear))
+                  lane_size = (uvec3){2, 2, 1}; /* 8B per lane (4B for linear src) */
+               else
+                  lane_size = (uvec3){2, 4, 1}; /* 8B per lane */
+            }
+         }
+      }
+   }
+
+   /* Check that the lane size fits into the shader key. */
+   static const union ac_cs_blit_key max_lane_size = {
+      .log_lane_width = ~0,
+      .log_lane_height = ~0,
+      .log_lane_depth = ~0,
+   };
+   assert(util_logbase2(lane_size.x) <= max_lane_size.log_lane_width);
+   assert(util_logbase2(lane_size.y) <= max_lane_size.log_lane_height);
+   assert(util_logbase2(lane_size.z) <= max_lane_size.log_lane_depth);
+
+   /* If the shader blits a block of pixels per lane, it must have the dst box aligned to that
+    * block because it can't blit a subset of pixels per lane.
+    *
+    * If the blit dst box is not aligned to the lane size, split it into multiple blits by cutting
+    * off the unaligned sides of the box and blitting the middle that's aligned to the lane size,
+    * then blit the unaligned sides separately. This splits the blit into up to 7 blits for 3D,
+    * and 5 blits for 2D.
+    */
+   if (blit->dst.box.x % lane_size.x ||
+       blit->dst.box.y % lane_size.y ||
+       blit->dst.box.z % lane_size.z ||
+       blit->dst.box.width % lane_size.x ||
+       blit->dst.box.height % lane_size.y ||
+       blit->dst.box.depth % lane_size.z) {
+      struct pipe_box middle;
+
+      /* Cut off unaligned regions on the sides of the box. */
+      middle.x = align(blit->dst.box.x, lane_size.x);
+      middle.y = align(blit->dst.box.y, lane_size.y);
+      middle.z = align(blit->dst.box.z, lane_size.z);
+
+      middle.width = blit->dst.box.width - (middle.x - blit->dst.box.x);
+      if (middle.width > 0)
+         middle.width -= middle.width % lane_size.x;
+      middle.height = blit->dst.box.height - (middle.y - blit->dst.box.y);
+      if (middle.height > 0)
+         middle.height -= middle.height % lane_size.y;
+      middle.depth = blit->dst.box.depth - (middle.z - blit->dst.box.z);
+      if (middle.depth > 0)
+         middle.depth -= middle.depth % lane_size.z;
+
+      /* Only a few cases are regressed by this. The vast majority benefits a lot.
+       * This was fine-tuned for Navi31, and might be suboptimal on different generations.
+       */
+      bool slow = (blit->dst.surf->is_linear && !is_clear && blit->src.surf->is_linear && depth > 1) ||
+                  (blit->dst.surf->thick_tiling &&
+                   ((blit->dst.surf->bpe == 8 && is_clear) ||
+                    (blit->dst.surf->bpe == 4 &&
+                     (blit->dst.surf->is_linear || (!is_clear && blit->src.surf->is_linear))) ||
+                    (blit->dst.surf->bpe == 2 && blit->dst.surf->is_linear && !is_clear &&
+                     blit->src.surf->is_linear))) ||
+                  (!blit->dst.surf->thick_tiling &&
+                   ((blit->dst.surf->bpe == 4 && blit->dst.surf->is_linear && !is_clear &&
+                     blit->src.surf->is_linear) ||
+                    (blit->dst.surf->bpe == 8 && !is_clear &&
+                     blit->dst.surf->is_linear != blit->src.surf->is_linear) ||
+                    (is_resolve && blit->dst.surf->bpe == 4 && src_samples == 4) ||
+                    (is_resolve && blit->dst.surf->bpe == 8 && src_samples == 2)));
+
+      /* Only use this if the middle blit is large enough. */
+      if (!slow && middle.width > 0 && middle.height > 0 && middle.depth > 0 &&
+          middle.width * middle.height * middle.depth * blit->dst.surf->bpe * dst_samples >
+          128 * 1024) {
+         /* Compute the size of unaligned regions on all sides of the box. */
+         struct pipe_box top, left, right, bottom, front, back;
+
+         assert(!options->is_nested);
+
+         top = blit->dst.box;
+         top.height = middle.y - top.y;
+
+         bottom = blit->dst.box;
+         bottom.y = middle.y + middle.height;
+         bottom.height = blit->dst.box.height - top.height - middle.height;
+
+         left = blit->dst.box;
+         left.y = middle.y;
+         left.height = middle.height;
+         left.width = middle.x - left.x;
+
+         right = blit->dst.box;
+         right.y = middle.y;
+         right.height = middle.height;
+         right.x = middle.x + middle.width;
+         right.width = blit->dst.box.width - left.width - middle.width;
+
+         front = blit->dst.box;
+         front.x = middle.x;
+         front.y = middle.y;
+         front.width = middle.width;
+         front.height = middle.height;
+         front.depth = middle.z - front.z;
+
+         back = blit->dst.box;
+         back.x = middle.x;
+         back.y = middle.y;
+         back.width = middle.width;
+         back.height = middle.height;
+         back.z = middle.z + middle.depth;
+         back.depth = blit->dst.box.depth - front.depth - middle.depth;
+
+         struct pipe_box boxes[] = {middle, top, bottom, left, right, front, back};
+
+         /* Verify that the boxes don't intersect. */
+         for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
+            for (unsigned j = i + 1; j < ARRAY_SIZE(boxes); j++) {
+               if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0 &&
+                   boxes[j].width > 0 && boxes[j].height > 0 && boxes[j].depth > 0) {
+                  if (u_box_test_intersection_3d(&boxes[i], &boxes[j])) {
+                     printf("\b   (%u, %u, %u) -> (%u, %u, %u) | (%u, %u, %u) -> (%u, %u, %u)\n",
+                            boxes[i].x, boxes[i].y, boxes[i].z,
+                            boxes[i].x + boxes[i].width - 1,
+                            boxes[i].y + boxes[i].height - 1,
+                            boxes[i].z + boxes[i].depth - 1,
+                            boxes[j].x, boxes[j].y, boxes[j].z,
+                            boxes[j].x + boxes[j].width,
+                            boxes[j].y + boxes[j].height,
+                            boxes[j].z + boxes[j].depth);
+                     assert(0);
+                  }
+               }
+            }
+         }
+
+         struct ac_cs_blit_options nested_options = *options;
+         nested_options.is_nested = true;
+
+         for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
+            if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) {
+               struct ac_cs_blit_description new_blit;
+               ASSERTED bool ok;
+
+               set_trimmed_blit(blit, &boxes[i], is_clear, &new_blit);
+               ok = ac_prepare_compute_blit(&nested_options, &new_blit, out);
+               assert(ok);
+            }
+         }
+         return true;
+      }
+   }
+
+   /* If the box can't blit split, at least reduce the lane size to the alignment of the box. */
+   lane_size.x = MIN3(lane_size.x, compute_alignment(blit->dst.box.x), compute_alignment(width));
+   lane_size.y = MIN3(lane_size.y, compute_alignment(blit->dst.box.y), compute_alignment(height));
+   lane_size.z = MIN3(lane_size.z, compute_alignment(blit->dst.box.z), compute_alignment(depth));
+
+   /* Determine the alignment of coordinates of the first thread of each wave. The alignment should be
+    * to a 256B block or the size of 1 wave, whichever is less, but there are a few exceptions.
+    */
+   uvec3 align;
+   if (is_3d_tiling) {
+      /* Thick tiling. */
+      /* This is based on GFX11_SW_PATTERN_NIBBLE01, which also matches GFX10. */
+      if (blit->dst.surf->bpe == 1)
+         align = (uvec3){8, 4, 8};
+      else if (blit->dst.surf->bpe == 2)
+         align = (uvec3){4, 4, 8};
+      else if (blit->dst.surf->bpe == 4)
+         align = (uvec3){4, 4, 4};
+      else if (blit->dst.surf->bpe == 8)
+         align = (uvec3){4, 2, 4};
+      else {
+         /* 16bpp linear source image reads perform better with this. */
+         if (!is_clear && blit->src.surf->is_linear)
+            align = (uvec3){4, 2, 4}; /* align to 512B for linear->tiled */
+         else
+            align = (uvec3){2, 2, 4};
+      }
+
+      /* Clamp the alignment to the expected size of 1 wave. */
+      align.x = MIN2(align.x, 4 * lane_size.x);
+      align.y = MIN2(align.y, 4 * lane_size.y);
+      align.z = MIN2(align.z, 4 * lane_size.z);
+   } else if (blit->dst.surf->is_linear) {
+      /* 1D blits from linear to linear are faster unaligned.
+       * 1D image clears don't benefit from any alignment.
+       */
+      if (height == 1 && depth == 1 && (is_clear || blit->src.surf->is_linear)) {
+         align = (uvec3){1, 1, 1};
+      } else {
+         /* Linear blits should use the cache line size instead of 256B alignment.
+          * Clamp it to the expected size of 1 wave.
+          */
+         align.x = MIN2(options->info->tcc_cache_line_size / blit->dst.surf->bpe, 64 * lane_size.x);
+         align.y = 1;
+         align.z = 1;
+      }
+   } else {
+      /* Thin tiling. */
+      if (info->gfx_level >= GFX11) {
+         /* Samples are next to each other on GFX11+. */
+         unsigned pix_size = blit->dst.surf->bpe * dst_samples;
+
+         /* This is based on GFX11_SW_PATTERN_NIBBLE01. */
+         if (pix_size == 1)
+            align = (uvec3){16, 16, 1};
+         else if (pix_size == 2)
+            align = (uvec3){16, 8, 1};
+         else if (pix_size == 4)
+            align = (uvec3){8, 8, 1};
+         else if (pix_size == 8)
+            align = (uvec3){8, 4, 1};
+         else if (pix_size == 16)
+            align = (uvec3){4, 4, 1};
+         else if (pix_size == 32)
+            align = (uvec3){4, 2, 1};
+         else if (pix_size == 64)
+            align = (uvec3){2, 2, 1};
+         else
+            align = (uvec3){2, 1, 1}; /* 16bpp 8xAA */
+      } else {
+         /* This is for 64KB_R_X. (most likely to occur due to DCC)
+          * It's based on GFX10_SW_64K_R_X_*xaa_RBPLUS_PATINFO (GFX10.3).
+          * The patterns are GFX10_SW_PATTERN_NIBBLE01[0, 1, 39, 6, 7] for 8bpp-128bpp.
+          * GFX6-10.1 and other swizzle modes might be similar.
+          */
+         if (blit->dst.surf->bpe == 1)
+            align = (uvec3){16, 16, 1};
+         else if (blit->dst.surf->bpe == 2)
+            align = (uvec3){16, 8, 1};
+         else if (blit->dst.surf->bpe == 4)
+            align = (uvec3){8, 8, 1};
+         else if (blit->dst.surf->bpe == 8)
+            align = (uvec3){8, 4, 1};
+         else
+            align = (uvec3){4, 4, 1};
+      }
+
+      /* Clamp the alignment to the expected size of 1 wave. */
+      align.x = MIN2(align.x, 8 * lane_size.x);
+      align.y = MIN2(align.y, 8 * lane_size.y);
+   }
+
+   /* If we don't have much to copy, don't align. The threshold is guessed and isn't covered
+    * by benchmarking.
+    */
+   if (width <= align.x * 4)
+      align.x = 1;
+   if (height <= align.y * 4)
+      align.y = 1;
+   if (depth <= align.z * 4)
+      align.z = 1;
+
+   unsigned start_x, start_y, start_z;
+   unsigned block_x, block_y, block_z;
+
+   /* If the blit destination area is unaligned, launch extra threads before 0,0,0 to make it
+    * aligned. This makes sure that a wave doesn't straddle a DCC block boundary or a cache line
+    * unnecessarily, so that each cache line is only stored by exactly 1 CU. The shader will skip
+    * the extra threads. This makes unaligned compute blits faster.
+    */
+   start_x = blit->dst.box.x % align.x;
+   start_y = blit->dst.box.y % align.y;
+   start_z = blit->dst.box.z % align.z;
+   width += start_x;
+   height += start_y;
+   depth += start_z;
+
+   /* Divide by the dispatch parameters by the lane size. */
+   assert(start_x % lane_size.x == 0);
+   assert(start_y % lane_size.y == 0);
+   assert(start_z % lane_size.z == 0);
+   assert(width % lane_size.x == 0);
+   assert(height % lane_size.y == 0);
+   assert(depth % lane_size.z == 0);
+
+   start_x /= lane_size.x;
+   start_y /= lane_size.y;
+   start_z /= lane_size.z;
+   width /= lane_size.x;
+   height /= lane_size.y;
+   depth /= lane_size.z;
+
+   /* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout
+    * of dst.
+    */
+   if (is_3d_tiling) {
+      /* Thick tiling. (microtiles are 3D boxes)
+       * If the box height and depth is > 2, the block size will be 4x4x4.
+       * If not, the threads will spill over to X.
+       */
+      block_y = util_next_power_of_two(MIN2(height, 4));
+      block_z = util_next_power_of_two(MIN2(depth, 4));
+      block_x = 64 / (block_y * block_z);
+   } else if (blit->dst.surf->is_linear) {
+      /* If the box width is > 128B, the block size will be 64x1 for bpp <= 4, 32x2 for bpp == 8,
+       * and 16x4 for bpp == 16.
+       * If not, the threads will spill over to Y, then Z if they aren't small.
+       *
+       * This is derived from the fact that the linear image layout has 256B linear blocks, and
+       * longer blocks don't benefit linear write performance, but they hurt tiled read performance.
+       * We want to prioritize blocks that are 256Bx2 over 512Bx1 because the source can be tiled.
+       *
+       * Using the cache line size (128B) instead of hardcoding 256B makes linear blits slower.
+       */
+      block_x = util_next_power_of_two(MIN3(width, 64, 256 / blit->dst.surf->bpe));
+      block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
+      block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
+      block_x = 64 / (block_y * block_z);
+   } else {
+      /* Thin tiling. (microtiles are 2D rectangles)
+       * If the box width and height is > 4, the block size will be 8x8.
+       * If Y is <= 4, the threads will spill over to X.
+       * If X is <= 4, the threads will spill over to Y, then Z if they aren't small.
+       */
+      block_y = util_next_power_of_two(MIN2(height, 8));
+      block_x = util_next_power_of_two(MIN2(width, 64 / block_y));
+      block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
+      block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
+      block_x = 64 / (block_y * block_z);
+   }
+
+   unsigned index = out->num_dispatches++;
+   assert(index < ARRAY_SIZE(out->dispatches));
+   struct ac_cs_blit_dispatch *dispatch = &out->dispatches[index];
+   unsigned wg_dim = set_work_size(dispatch, block_x, block_y, block_z, width, height, depth);
+
+   /* Get the shader key. */
+   union ac_cs_blit_key key;
+   key.key = 0;
+
+   /* Only ACO can form VMEM clauses for image stores, which is a requirement for performance. */
+   key.use_aco = true;
+   key.is_clear = is_clear;
+   key.wg_dim = wg_dim;
+   key.has_start_xyz = start_x || start_y || start_z;
+   key.log_lane_width = util_logbase2(lane_size.x);
+   key.log_lane_height = util_logbase2(lane_size.y);
+   key.log_lane_depth = util_logbase2(lane_size.z);
+   key.dst_is_1d = blit->dst.dim == 1;
+   key.dst_is_msaa = dst_samples > 1;
+   key.dst_has_z = blit->dst.dim == 3 || blit->dst.is_array;
+   key.last_dst_channel = util_format_get_last_component(blit->dst.format);
+
+   /* ACO doesn't support D16 on GFX8 */
+   bool has_d16 = info->gfx_level >= (key.use_aco || options->use_aco ? GFX9 : GFX8);
+
+   if (is_clear) {
+      assert(dst_samples <= 8);
+      key.log_samples = util_logbase2(dst_samples);
+      key.a16 = info->gfx_level >= GFX9 && util_is_box_sint16(&blit->dst.box);
+      key.d16 = has_d16 &&
+                max_dst_chan_size <= (util_format_is_float(blit->dst.format) ||
+                                      util_format_is_pure_integer(blit->dst.format) ? 16 : 11);
+   } else {
+      key.src_is_1d = blit->src.dim == 1;
+      key.src_is_msaa = src_samples > 1;
+      key.src_has_z = blit->src.dim == 3 || blit->src.is_array;
+      /* Resolving integer formats only copies sample 0. log_samples is then unused. */
+      key.sample0_only = sample0_only;
+      unsigned num_samples = MAX2(src_samples, dst_samples);
+      assert(num_samples <= 8);
+      key.log_samples = sample0_only ? 0 : util_logbase2(num_samples);
+      key.x_clamp_to_edge = should_blit_clamp_to_edge(blit, BITFIELD_BIT(0));
+      key.y_clamp_to_edge = should_blit_clamp_to_edge(blit, BITFIELD_BIT(1));
+      key.flip_x = blit->src.box.width < 0;
+      key.flip_y = blit->src.box.height < 0;
+      key.sint_to_uint = util_format_is_pure_sint(blit->src.format) &&
+                         util_format_is_pure_uint(blit->dst.format);
+      key.uint_to_sint = util_format_is_pure_uint(blit->src.format) &&
+                         util_format_is_pure_sint(blit->dst.format);
+      key.dst_is_srgb = util_format_is_srgb(blit->dst.format);
+      key.last_src_channel = MIN2(util_format_get_last_component(blit->src.format),
+                                  key.last_dst_channel);
+      key.use_integer_one = util_format_is_pure_integer(blit->dst.format) &&
+                            key.last_src_channel < key.last_dst_channel &&
+                            key.last_dst_channel == 3;
+      key.a16 = info->gfx_level >= GFX9 && util_is_box_sint16(&blit->dst.box) &&
+                util_is_box_sint16(&blit->src.box);
+      key.d16 = has_d16 &&
+                /* Blitting FP16 using D16 has precision issues. Resolving has precision
+                 * issues all the way down to R11G11B10_FLOAT. */
+                MIN2(max_dst_chan_size, max_src_chan_size) <=
+                (util_format_is_pure_integer(blit->dst.format) ?
+                    (key.sint_to_uint || key.uint_to_sint ? 10 : 16) :
+                    (is_resolve ? 10 : 11));
+   }
+
+   dispatch->shader_key = key;
+
+   dispatch->user_data[0] = (blit->src.box.x & 0xffff) | ((blit->dst.box.x & 0xffff) << 16);
+   dispatch->user_data[1] = (blit->src.box.y & 0xffff) | ((blit->dst.box.y & 0xffff) << 16);
+   dispatch->user_data[2] = (blit->src.box.z & 0xffff) | ((blit->dst.box.z & 0xffff) << 16);
+   dispatch->user_data[3] = (start_x & 0xff) | ((start_y & 0xff) << 8) | ((start_z & 0xff) << 16);
+
+   if (is_clear) {
+      union pipe_color_union final_value;
+      memcpy(&final_value, &blit->clear_color, sizeof(final_value));
+
+      /* Do the conversion to sRGB here instead of the shader. */
+      if (util_format_is_srgb(blit->dst.format)) {
+         for (int i = 0; i < 3; i++)
+            final_value.f[i] = util_format_linear_to_srgb_float(final_value.f[i]);
+      }
+
+      if (key.d16) {
+         enum pipe_format data_format;
+
+         if (util_format_is_pure_uint(blit->dst.format))
+            data_format = PIPE_FORMAT_R16G16B16A16_UINT;
+         else if (util_format_is_pure_sint(blit->dst.format))
+            data_format = PIPE_FORMAT_R16G16B16A16_SINT;
+         else
+            data_format = PIPE_FORMAT_R16G16B16A16_FLOAT;
+
+         util_pack_color_union(data_format, (union util_color *)&dispatch->user_data[4],
+                               &final_value);
+      } else {
+         memcpy(&dispatch->user_data[4], &final_value, sizeof(final_value));
+      }
+   }
+
+   return true;
+}
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -860,55 +860,32 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
   return success;
 }

-/* Return a power-of-two alignment of a number. */
-static unsigned compute_alignment(unsigned x)
+static unsigned get_tex_dim(struct si_texture *tex)
 {
-   return x ? BITFIELD_BIT(ffs(x) - 1) : BITFIELD_BIT(31);
-}
-
-/* Set the blit info, but change the dst box and trim the src box according to the new dst box. */
-static void set_trimmed_blit(const struct pipe_blit_info *old, const struct pipe_box *box,
-                             bool is_clear, struct pipe_blit_info *out)
-{
-   assert(old->dst.box.x <= box->x);
-   assert(old->dst.box.y <= box->y);
-   assert(old->dst.box.z <= box->z);
-   assert(box->x + box->width <= old->dst.box.x + old->dst.box.width);
-   assert(box->y + box->height <= old->dst.box.y + old->dst.box.height);
-   assert(box->z + box->depth <= old->dst.box.z + old->dst.box.depth);
-   /* No scaling. */
-   assert(is_clear || old->dst.box.width == abs(old->src.box.width));
-   assert(is_clear || old->dst.box.height == abs(old->src.box.height));
-   assert(is_clear || old->dst.box.depth == abs(old->src.box.depth));
-
-   *out = *old;
-   out->dst.box = *box;
-
-   if (!is_clear) {
-      if (out->src.box.width > 0) {
-         out->src.box.x += box->x - old->dst.box.x;
-         out->src.box.width = box->width;
-      } else {
-         out->src.box.x -= box->x - old->dst.box.x;
-         out->src.box.width = -box->width;
-      }
-
-      if (out->src.box.height > 0) {
-         out->src.box.y += box->y - old->dst.box.y;
-         out->src.box.height = box->height;
-      } else {
-         out->src.box.y -= box->y - old->dst.box.y;
-         out->src.box.height = -box->height;
-      }
-
-      out->src.box.z += box->z - old->dst.box.z;
-      out->src.box.depth = box->depth;
+   switch (tex->buffer.b.b.target) {
+   case PIPE_TEXTURE_3D:
+      return 3;
+   case PIPE_BUFFER:
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return 1;
+   default:
+      return 2;
   }
 }

-typedef struct {
-   unsigned x, y, z;
-} uvec3;
+static bool get_tex_is_array(struct si_texture *tex)
+{
+   switch (tex->buffer.b.b.target) {
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_1D_ARRAY:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return true;
+   default:
+      return false;;
+   }
+}

 bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
                     const union pipe_color_union *clear_color, unsigned dst_access,
@ -916,31 +893,8 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
 {
   struct si_texture *sdst = (struct si_texture *)info->dst.resource;
   struct si_texture *ssrc = (struct si_texture *)info->src.resource;
-   bool is_2d_tiling = !sdst->surface.is_linear && !sdst->surface.thick_tiling;
-   bool is_3d_tiling = sdst->surface.thick_tiling;
-   bool is_clear = !info->src.resource;
+   bool is_clear = !ssrc;
   unsigned dst_samples = MAX2(1, sdst->buffer.b.b.nr_samples);
-   unsigned src_samples = is_clear ? 1 : MAX2(1, ssrc->buffer.b.b.nr_samples);
-   bool is_resolve = !is_clear && dst_samples == 1 && src_samples >= 2 &&
-                     !util_format_is_pure_integer(info->dst.format);
-   bool is_upsampling = !is_clear && src_samples == 1 && dst_samples >= 2;
-   bool sample0_only = src_samples >= 2 && dst_samples == 1 &&
-                       (info->sample0_only || util_format_is_pure_integer(info->dst.format));
-   /* Get the channel sizes. */
-   unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format);
-   unsigned max_src_chan_size = is_clear ? 0 : util_format_get_max_channel_size(info->src.format);
-
-   /* Reject blits with invalid parameters. */
-   if (info->dst.box.width < 0 || info->dst.box.height < 0 || info->dst.box.depth < 0 ||
-       info->src.box.depth < 0) {
-      assert(!"invalid box parameters"); /* this is reachable and prevents hangs */
-      return true;
-   }
-
-   /* Skip zero-area blits. */
-   if (!info->dst.box.width || !info->dst.box.height || !info->dst.box.depth ||
-       (!is_clear && (!info->src.box.width || !info->src.box.height || !info->src.box.depth)))
-      return true;

   /* MSAA image stores don't work on <= Gfx10.3. It's an issue with FMASK because
    * AMD_DEBUG=nofmask fixes them. EQAA image stores are also unimplemented.
@ -949,661 +903,62 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   if (sctx->gfx_level < GFX11 && !(sctx->screen->debug_flags & DBG(NO_FMASK)) && dst_samples > 1)
      return false;

-   if (info->dst.format == PIPE_FORMAT_A8R8_UNORM || /* This format fails AMD_TEST=imagecopy. */
-       max_dst_chan_size == 5 || /* PIPE_FORMAT_R5G5B5A1_UNORM has precision issues */
-       util_format_is_depth_or_stencil(info->dst.resource->format) ||
-       dst_samples > SI_MAX_COMPUTE_BLIT_SAMPLES ||
-       info->dst_sample != 0 ||
-       /* Image stores support DCC since GFX10. Return only for gfx queues. DCC is disabled
-        * for compute queues farther below. */
+   if (info->dst_sample != 0 ||
+       /* Image stores support DCC since GFX10. Return only for gfx queues because compute queues
+        * can't return false. DCC is disabled for compute queues farther below. */
       (sctx->gfx_level < GFX10 && sctx->has_graphics && vi_dcc_enabled(sdst, info->dst.level) &&
        !src_access && !dst_access) ||
       info->alpha_blend ||
       info->num_window_rectangles ||
-       info->scissor_enable ||
-       (!is_clear &&
-        /* Scaling is not implemented by the compute shader. */
-        (info->dst.box.width != abs(info->src.box.width) ||
-         info->dst.box.height != abs(info->src.box.height) ||
-         info->dst.box.depth != abs(info->src.box.depth) ||
-         util_format_is_depth_or_stencil(info->src.resource->format) ||
-         src_samples > SI_MAX_COMPUTE_BLIT_SAMPLES)))
+       info->scissor_enable)
      return false;

-   /* Return a failure if a compute blit is slower than a gfx blit. */
-   if (sctx->has_graphics && flags & SI_OP_FAIL_IF_SLOW) {
-      if (is_clear) {
-         /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21, Navi31 */
-         if (is_3d_tiling) {
-            if (sctx->gfx_level == GFX6 && sdst->surface.bpe == 8)
-               return false;
-         } else if (is_2d_tiling) {
-            if (!(sctx->gfx_level == GFX6 && sdst->surface.bpe <= 4 && dst_samples == 1) &&
-                !(sctx->gfx_level == GFX7 && sdst->surface.bpe == 1 && dst_samples == 1))
-               return false;
-         }
-      } else {
-         /* For upsampling, image stores don't compress MSAA as good as draws. */
-         if (is_upsampling)
-            return false;
+   struct ac_cs_blit_options options = {
+      .nir_options = sctx->screen->nir_options,
+      .info = &sctx->screen->info,
+      .use_aco = sctx->screen->use_aco,
+      .no_fmask = sctx->screen->debug_flags & DBG(NO_FMASK),
+      /* Compute queues can't fail because there is no alternative. */
+      .fail_if_slow = sctx->has_graphics && flags & SI_OP_FAIL_IF_SLOW,
+   };

-         switch (sctx->gfx_level) {
-         case GFX6:
-         case GFX7:
-         case GFX8:
-         case GFX9:
-         case GFX10:
-         case GFX10_3:
-            /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21 */
-            if (is_resolve) {
-               if (!(sctx->gfx_level == GFX7 && sdst->surface.bpe == 16))
-                  return false;
-            } else {
-               assert(dst_samples == src_samples || sample0_only);
+   struct ac_cs_blit_description blit = {
+      .dst = {
+         .surf = &sdst->surface,
+         .dim = get_tex_dim(sdst),
+         .is_array = get_tex_is_array(sdst),
+         .width0 = info->dst.resource->width0,
+         .height0 = info->dst.resource->height0,
+         .num_samples = info->dst.resource->nr_samples,
+         .level = info->dst.level,
+         .box = info->dst.box,
+         .format = info->dst.format,
+      },
+      .src = {
+         .surf = ssrc ? &ssrc->surface : NULL,
+         .dim = ssrc ? get_tex_dim(ssrc) : 0,
+         .is_array = ssrc ? get_tex_is_array(ssrc) : false,
+         .width0 = ssrc ? info->src.resource->width0 : 0,
+         .height0 = ssrc ? info->src.resource->height0 : 0,
+         .num_samples = ssrc ? info->src.resource->nr_samples : 0,
+         .level = info->src.level,
+         .box = info->src.box,
+         .format = info->src.format,
+      },
+      .sample0_only = info->sample0_only,
+   };

-               if (is_2d_tiling) {
-                  if (dst_samples == 1) {
-                     if (sdst->surface.bpe <= 8 &&
-                         !(sctx->gfx_level <= GFX7 && sdst->surface.bpe == 1) &&
-                         !(sctx->gfx_level == GFX6 && sdst->surface.bpe == 2 &&
-                           ssrc->surface.is_linear) &&
-                         !(sctx->gfx_level == GFX7 && sdst->surface.bpe >= 2 &&
-                           ssrc->surface.is_linear) &&
-                         !((sctx->gfx_level == GFX8 || sctx->gfx_level == GFX9) &&
-                           sdst->surface.bpe >= 2 && ssrc->surface.is_linear) &&
-                         !(sctx->gfx_level == GFX10 && sdst->surface.bpe <= 2 &&
-                           ssrc->surface.is_linear) &&
-                         !(sctx->gfx_level == GFX10_3 && sdst->surface.bpe == 8 &&
-                           ssrc->surface.is_linear))
-                        return false;
+   if (clear_color)
+      blit.clear_color = *clear_color;

-                     if (sctx->gfx_level == GFX6 && sdst->surface.bpe == 16 &&
-                         ssrc->surface.is_linear && sdst->buffer.b.b.target != PIPE_TEXTURE_3D)
-                        return false;
-
-                     if (sdst->surface.bpe == 16 && !ssrc->surface.is_linear &&
-                         /* Only GFX6 selects 2D tiling for 128bpp 3D textures. */
-                         !(sctx->gfx_level == GFX6 && sdst->buffer.b.b.target == PIPE_TEXTURE_3D) &&
-                         sctx->gfx_level != GFX7)
-                        return false;
-                  } else {
-                     /* MSAA copies - tested only without FMASK on Navi21. */
-                     if (sdst->surface.bpe >= 4)
-                        return false;
-                  }
-               }
-            }
-            break;
-
-         case GFX11:
-         case GFX11_5:
-         default:
-            /* Verified on Navi31. */
-            if (is_resolve) {
-               if (!((sdst->surface.bpe <= 2 && src_samples == 2) ||
-                     (sdst->surface.bpe == 16 && src_samples == 4)))
-                  return false;
-            } else {
-               assert(dst_samples == src_samples || sample0_only);
-
-               if (is_2d_tiling) {
-                  if (sdst->surface.bpe == 2 && ssrc->surface.is_linear && dst_samples == 1)
-                     return false;
-
-                  if ((sdst->surface.bpe == 4 || sdst->surface.bpe == 8) && dst_samples == 1)
-                     return false;
-
-                  if (sdst->surface.bpe == 16 && dst_samples == 1 && !ssrc->surface.is_linear)
-                     return false;
-
-                  if (sdst->surface.bpe == 16 && dst_samples == 8)
-                     return false;
-               }
-            }
-            break;
-         }
-      }
-   }
+   struct ac_cs_blit_dispatches out;
+   if (!ac_prepare_compute_blit(&options, &blit, &out))
+      return false;

+   /* This is needed for compute queues if DCC stores are unsupported. */
   if (sctx->gfx_level < GFX10 && !sctx->has_graphics && vi_dcc_enabled(sdst, info->dst.level))
      si_texture_disable_dcc(sctx, sdst);

-   unsigned width = info->dst.box.width;
-   unsigned height = info->dst.box.height;
-   unsigned depth = info->dst.box.depth;
-   uvec3 lane_size = (uvec3){1, 1, 1};
-
-   /* Determine the size of the block of pixels that will be processed by a single lane.
-    * Generally we want to load and store about 8-16B per lane, but there are exceptions.
-    * The block sizes were fine-tuned for Navi31, and might be suboptimal on different generations.
-    */
-   if (sdst->surface.bpe <= 8 && (is_resolve ? src_samples : dst_samples) <= 4 &&
-       /* Small blits don't benefit. */
-       width * height * depth * sdst->surface.bpe * dst_samples > 128 * 1024) {
-      if (is_3d_tiling) {
-         /* Thick tiling. */
-         if (!is_clear && ssrc->surface.is_linear) {
-            /* Linear -> Thick. */
-            if (sdst->surface.bpe == 4)
-               lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
-            else if (sdst->surface.bpe == 2)
-               lane_size = (uvec3){2, 1, 2}; /* 8B per lane */
-            else if (sdst->surface.bpe == 1)
-               lane_size = (uvec3){4, 1, 2}; /* 8B per lane */
-         } else {
-            if (sdst->surface.bpe == 8)
-               lane_size = (uvec3){1, 1, 2}; /* 16B per lane */
-            else if (sdst->surface.bpe == 4)
-               lane_size = (uvec3){1, 2, 2}; /* 16B per lane */
-            else if (sdst->surface.bpe == 2)
-               lane_size = (uvec3){1, 2, 4}; /* 16B per lane */
-            else
-               lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
-         }
-      } else if (sdst->surface.is_linear) {
-         /* Linear layout. */
-         if (!is_clear && !ssrc->surface.is_linear) {
-            /* Tiled -> Linear. */
-            if (sdst->surface.bpe == 8 && !ssrc->surface.thick_tiling)
-               lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
-            else if (sdst->surface.bpe == 4)
-               lane_size = (uvec3){1, 2, 1}; /* 8B per lane */
-            else if (sdst->surface.bpe == 2 && ssrc->surface.thick_tiling)
-               lane_size = (uvec3){2, 2, 1}; /* 8B per lane */
-            else if (sdst->surface.bpe == 1 && ssrc->surface.thick_tiling)
-               lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
-            else if (sdst->surface.bpe <= 2)
-               lane_size = (uvec3){2, 4, 1}; /* 8-16B per lane */
-         } else {
-            /* Clear or Linear -> Linear. */
-            if (sdst->surface.bpe == 8)
-               lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
-            else if (sdst->surface.bpe == 4)
-               lane_size = (uvec3){4, 1, 1}; /* 16B per lane */
-            else if (sdst->surface.bpe == 2)
-               lane_size = (uvec3){4, 2, 1}; /* 16B per lane */
-            else
-               lane_size = (uvec3){8, 1, 1}; /* 8B per lane */
-         }
-      } else {
-         /* Thin tiling. */
-         if (is_resolve) {
-            if (sdst->surface.bpe == 8 && src_samples == 2) {
-               lane_size = (uvec3){1, 2, 1}; /* 32B->16B per lane */
-            } else if (sdst->surface.bpe == 4) {
-               lane_size = (uvec3){2, 1, 1}; /* 32B->8B for 4 samples, 16B->8B for 2 samples */
-            } else if (sdst->surface.bpe <= 2) {
-               if (src_samples == 4)
-                  lane_size = (uvec3){2, 1, 1}; /* 16B->4B for 16bpp, 8B->2B for 8bpp */
-               else
-                  lane_size = (uvec3){2, 2, 1}; /* 16B->8B for 16bpp, 8B->4B for 8bpp */
-            }
-         } else {
-            if (sdst->surface.bpe == 8 && dst_samples == 1)
-               lane_size = (uvec3){1, 2, 1}; /* 16B per lane */
-            else if (sdst->surface.bpe == 4) {
-               if (dst_samples == 2)
-                  lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
-               else if (dst_samples == 1)
-                  lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
-            } else if (sdst->surface.bpe == 2) {
-               if (dst_samples == 4 || (!is_clear && ssrc->surface.is_linear))
-                  lane_size = (uvec3){2, 1, 1}; /* 16B per lane (4B for linear src) */
-               else if (dst_samples == 2)
-                  lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
-               else
-                  lane_size = (uvec3){2, 4, 1}; /* 16B per lane */
-            } else if (sdst->surface.bpe == 1) {
-               if (dst_samples == 4)
-                  lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
-               else if (dst_samples == 2 || (!is_clear && ssrc->surface.is_linear))
-                  lane_size = (uvec3){2, 2, 1}; /* 8B per lane (4B for linear src) */
-               else
-                  lane_size = (uvec3){2, 4, 1}; /* 8B per lane */
-            }
-         }
-      }
-   }
-
-   /* Check that the lane size fits into the shader key. */
-   static const union ac_cs_blit_key max_lane_size = {
-      .log_lane_width = ~0,
-      .log_lane_height = ~0,
-      .log_lane_depth = ~0,
-   };
-   assert(util_logbase2(lane_size.x) <= max_lane_size.log_lane_width);
-   assert(util_logbase2(lane_size.y) <= max_lane_size.log_lane_height);
-   assert(util_logbase2(lane_size.z) <= max_lane_size.log_lane_depth);
-
-   /* If the shader blits a block of pixels per lane, it must have the dst box aligned to that
-    * block because it can't blit a subset of pixels per lane.
-    *
-    * If the blit dst box is not aligned to the lane size, split it into multiple blits by cutting
-    * off the unaligned sides of the box and blitting the middle that's aligned to the lane size,
-    * then blit the unaligned sides separately. This splits the blit into up to 7 blits for 3D,
-    * and 5 blits for 2D.
-    */
-   if (info->dst.box.x % lane_size.x ||
-       info->dst.box.y % lane_size.y ||
-       info->dst.box.z % lane_size.z ||
-       info->dst.box.width % lane_size.x ||
-       info->dst.box.height % lane_size.y ||
-       info->dst.box.depth % lane_size.z) {
-      struct pipe_box middle;
-
-      /* Cut off unaligned regions on the sides of the box. */
-      middle.x = align(info->dst.box.x, lane_size.x);
-      middle.y = align(info->dst.box.y, lane_size.y);
-      middle.z = align(info->dst.box.z, lane_size.z);
-
-      middle.width = info->dst.box.width - (middle.x - info->dst.box.x);
-      if (middle.width > 0)
-         middle.width -= middle.width % lane_size.x;
-      middle.height = info->dst.box.height - (middle.y - info->dst.box.y);
-      if (middle.height > 0)
-         middle.height -= middle.height % lane_size.y;
-      middle.depth = info->dst.box.depth - (middle.z - info->dst.box.z);
-      if (middle.depth > 0)
-         middle.depth -= middle.depth % lane_size.z;
-
-      /* Only a few cases are regressed by this. The vast majority benefits a lot.
-       * This was fine-tuned for Navi31, and might be suboptimal on different generations.
-       */
-      bool slow = (sdst->surface.is_linear && !is_clear && ssrc->surface.is_linear && depth > 1) ||
-                  (sdst->surface.thick_tiling &&
-                   ((sdst->surface.bpe == 8 && is_clear) ||
-                    (sdst->surface.bpe == 4 &&
-                     (sdst->surface.is_linear || (!is_clear && ssrc->surface.is_linear))) ||
-                    (sdst->surface.bpe == 2 && sdst->surface.is_linear && !is_clear &&
-                     ssrc->surface.is_linear))) ||
-                  (!sdst->surface.thick_tiling &&
-                   ((sdst->surface.bpe == 4 && sdst->surface.is_linear && !is_clear &&
-                     ssrc->surface.is_linear) ||
-                    (sdst->surface.bpe == 8 && !is_clear &&
-                     sdst->surface.is_linear != ssrc->surface.is_linear) ||
-                    (is_resolve && sdst->surface.bpe == 4 && src_samples == 4) ||
-                    (is_resolve && sdst->surface.bpe == 8 && src_samples == 2)));
-
-      /* Only use this if the middle blit is large enough. */
-      if (!slow && middle.width > 0 && middle.height > 0 && middle.depth > 0 &&
-          middle.width * middle.height * middle.depth * sdst->surface.bpe * dst_samples >
-          128 * 1024) {
-         /* Compute the size of unaligned regions on all sides of the box. */
-         struct pipe_box top, left, right, bottom, front, back;
-
-         assert(!(flags & SI_OP_IS_NESTED));
-
-         top = info->dst.box;
-         top.height = middle.y - top.y;
-
-         bottom = info->dst.box;
-         bottom.y = middle.y + middle.height;
-         bottom.height = info->dst.box.height - top.height - middle.height;
-
-         left = info->dst.box;
-         left.y = middle.y;
-         left.height = middle.height;
-         left.width = middle.x - left.x;
-
-         right = info->dst.box;
-         right.y = middle.y;
-         right.height = middle.height;
-         right.x = middle.x + middle.width;
-         right.width = info->dst.box.width - left.width - middle.width;
-
-         front = info->dst.box;
-         front.x = middle.x;
-         front.y = middle.y;
-         front.width = middle.width;
-         front.height = middle.height;
-         front.depth = middle.z - front.z;
-
-         back = info->dst.box;
-         back.x = middle.x;
-         back.y = middle.y;
-         back.width = middle.width;
-         back.height = middle.height;
-         back.z = middle.z + middle.depth;
-         back.depth = info->dst.box.depth - front.depth - middle.depth;
-
-         struct pipe_box boxes[] = {middle, top, bottom, left, right, front, back};
-         int last = -1;
-
-         /* Verify that the boxes don't intersect. */
-         for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
-            for (unsigned j = i + 1; j < ARRAY_SIZE(boxes); j++) {
-               if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0 &&
-                   boxes[j].width > 0 && boxes[j].height > 0 && boxes[j].depth > 0) {
-                  if (u_box_test_intersection_3d(&boxes[i], &boxes[j])) {
-                     printf("\b   (%u, %u, %u) -> (%u, %u, %u) | (%u, %u, %u) -> (%u, %u, %u)\n",
-                            boxes[i].x, boxes[i].y, boxes[i].z,
-                            boxes[i].x + boxes[i].width - 1,
-                            boxes[i].y + boxes[i].height - 1,
-                            boxes[i].z + boxes[i].depth - 1,
-                            boxes[j].x, boxes[j].y, boxes[j].z,
-                            boxes[j].x + boxes[j].width,
-                            boxes[j].y + boxes[j].height,
-                            boxes[j].z + boxes[j].depth);
-                     assert(0);
-                  }
-               }
-            }
-         }
-
-         for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
-            if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0)
-               last = i;
-         }
-         assert(last > 0);
-
-         for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
-            if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) {
-               struct pipe_blit_info new_info;
-               ASSERTED bool ok;
-
-               set_trimmed_blit(info, &boxes[i], is_clear, &new_info);
-               ok = si_compute_blit(sctx, &new_info, clear_color, dst_access, src_access,
-                                    (flags & ~SI_OP_SYNC_BEFORE_AFTER) | SI_OP_IS_NESTED |
-                                    (i == 0 ? flags & SI_OP_SYNC_BEFORE : 0) |
-                                    (i == last ? flags & SI_OP_SYNC_AFTER : 0));
-               assert(ok);
-            }
-         }
-         return true;
-      }
-   }
-
-   /* If the box can't blit split, at least reduce the lane size to the alignment of the box. */
-   lane_size.x = MIN3(lane_size.x, compute_alignment(info->dst.box.x), compute_alignment(width));
-   lane_size.y = MIN3(lane_size.y, compute_alignment(info->dst.box.y), compute_alignment(height));
-   lane_size.z = MIN3(lane_size.z, compute_alignment(info->dst.box.z), compute_alignment(depth));
-
-   /* Determine the alignment of coordinates of the first thread of each wave. The alignment should be
-    * to a 256B block or the size of 1 wave, whichever is less, but there are a few exceptions.
-    */
-   uvec3 align;
-   if (is_3d_tiling) {
-      /* Thick tiling. */
-      /* This is based on GFX11_SW_PATTERN_NIBBLE01, which also matches GFX10. */
-      if (sdst->surface.bpe == 1)
-         align = (uvec3){8, 4, 8};
-      else if (sdst->surface.bpe == 2)
-         align = (uvec3){4, 4, 8};
-      else if (sdst->surface.bpe == 4)
-         align = (uvec3){4, 4, 4};
-      else if (sdst->surface.bpe == 8)
-         align = (uvec3){4, 2, 4};
-      else {
-         /* 16bpp linear source image reads perform better with this. */
-         if (!is_clear && ssrc->surface.is_linear)
-            align = (uvec3){4, 2, 4}; /* align to 512B for linear->tiled */
-         else
-            align = (uvec3){2, 2, 4};
-      }
-
-      /* Clamp the alignment to the expected size of 1 wave. */
-      align.x = MIN2(align.x, 4 * lane_size.x);
-      align.y = MIN2(align.y, 4 * lane_size.y);
-      align.z = MIN2(align.z, 4 * lane_size.z);
-   } else if (sdst->surface.is_linear) {
-      /* 1D blits from linear to linear are faster unaligned.
-       * 1D image clears don't benefit from any alignment.
-       */
-      if (height == 1 && depth == 1 && (is_clear || ssrc->surface.is_linear)) {
-         align = (uvec3){1, 1, 1};
-      } else {
-         /* Linear blits should use the cache line size instead of 256B alignment.
-          * Clamp it to the expected size of 1 wave.
-          */
-         align.x = MIN2(sctx->screen->info.tcc_cache_line_size / sdst->surface.bpe, 64 * lane_size.x);
-         align.y = 1;
-         align.z = 1;
-      }
-   } else {
-      /* Thin tiling. */
-      if (sctx->gfx_level >= GFX11) {
-         /* Samples are next to each other on GFX11+. */
-         unsigned pix_size = sdst->surface.bpe * dst_samples;
-
-         /* This is based on GFX11_SW_PATTERN_NIBBLE01. */
-         if (pix_size == 1)
-            align = (uvec3){16, 16, 1};
-         else if (pix_size == 2)
-            align = (uvec3){16, 8, 1};
-         else if (pix_size == 4)
-            align = (uvec3){8, 8, 1};
-         else if (pix_size == 8)
-            align = (uvec3){8, 4, 1};
-         else if (pix_size == 16)
-            align = (uvec3){4, 4, 1};
-         else if (pix_size == 32)
-            align = (uvec3){4, 2, 1};
-         else if (pix_size == 64)
-            align = (uvec3){2, 2, 1};
-         else
-            align = (uvec3){2, 1, 1}; /* 16bpp 8xAA */
-      } else {
-         /* This is for 64KB_R_X. (most likely to occur due to DCC)
-          * It's based on GFX10_SW_64K_R_X_*xaa_RBPLUS_PATINFO (GFX10.3).
-          * The patterns are GFX10_SW_PATTERN_NIBBLE01[0, 1, 39, 6, 7] for 8bpp-128bpp.
-          * GFX6-10.1 and other swizzle modes might be similar.
-          */
-         if (sdst->surface.bpe == 1)
-            align = (uvec3){16, 16, 1};
-         else if (sdst->surface.bpe == 2)
-            align = (uvec3){16, 8, 1};
-         else if (sdst->surface.bpe == 4)
-            align = (uvec3){8, 8, 1};
-         else if (sdst->surface.bpe == 8)
-            align = (uvec3){8, 4, 1};
-         else
-            align = (uvec3){4, 4, 1};
-      }
-
-      /* Clamp the alignment to the expected size of 1 wave. */
-      align.x = MIN2(align.x, 8 * lane_size.x);
-      align.y = MIN2(align.y, 8 * lane_size.y);
-   }
-
-   /* If we don't have much to copy, don't align. The threshold is guessed and isn't covered
-    * by benchmarking.
-    */
-   if (width <= align.x * 4)
-      align.x = 1;
-   if (height <= align.y * 4)
-      align.y = 1;
-   if (depth <= align.z * 4)
-      align.z = 1;
-
-   unsigned start_x, start_y, start_z;
-   unsigned block_x, block_y, block_z;
-
-   /* If the blit destination area is unaligned, launch extra threads before 0,0,0 to make it
-    * aligned. This makes sure that a wave doesn't straddle a DCC block boundary or a cache line
-    * unnecessarily, so that each cache line is only stored by exactly 1 CU. The shader will skip
-    * the extra threads. This makes unaligned compute blits faster.
-    */
-   start_x = info->dst.box.x % align.x;
-   start_y = info->dst.box.y % align.y;
-   start_z = info->dst.box.z % align.z;
-   width += start_x;
-   height += start_y;
-   depth += start_z;
-
-   /* Divide by the dispatch parameters by the lane size. */
-   assert(start_x % lane_size.x == 0);
-   assert(start_y % lane_size.y == 0);
-   assert(start_z % lane_size.z == 0);
-   assert(width % lane_size.x == 0);
-   assert(height % lane_size.y == 0);
-   assert(depth % lane_size.z == 0);
-
-   start_x /= lane_size.x;
-   start_y /= lane_size.y;
-   start_z /= lane_size.z;
-   width /= lane_size.x;
-   height /= lane_size.y;
-   depth /= lane_size.z;
-
-   /* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout
-    * of dst.
-    */
-   if (is_3d_tiling) {
-      /* Thick tiling. (microtiles are 3D boxes)
-       * If the box height and depth is > 2, the block size will be 4x4x4.
-       * If not, the threads will spill over to X.
-       */
-      block_y = util_next_power_of_two(MIN2(height, 4));
-      block_z = util_next_power_of_two(MIN2(depth, 4));
-      block_x = 64 / (block_y * block_z);
-   } else if (sdst->surface.is_linear) {
-      /* If the box width is > 128B, the block size will be 64x1 for bpp <= 4, 32x2 for bpp == 8,
-       * and 16x4 for bpp == 16.
-       * If not, the threads will spill over to Y, then Z if they aren't small.
-       *
-       * This is derived from the fact that the linear image layout has 256B linear blocks, and
-       * longer blocks don't benefit linear write performance, but they hurt tiled read performance.
-       * We want to prioritize blocks that are 256Bx2 over 512Bx1 because the source can be tiled.
-       *
-       * Using the cache line size (128B) instead of hardcoding 256B makes linear blits slower.
-       */
-      block_x = util_next_power_of_two(MIN3(width, 64, 256 / sdst->surface.bpe));
-      block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
-      block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
-      block_x = 64 / (block_y * block_z);
-   } else {
-      /* Thin tiling. (microtiles are 2D rectangles)
-       * If the box width and height is > 4, the block size will be 8x8.
-       * If Y is <= 4, the threads will spill over to X.
-       * If X is <= 4, the threads will spill over to Y, then Z if they aren't small.
-       */
-      block_y = util_next_power_of_two(MIN2(height, 8));
-      block_x = util_next_power_of_two(MIN2(width, 64 / block_y));
-      block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
-      block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
-      block_x = 64 / (block_y * block_z);
-   }
-
-   struct pipe_grid_info grid = {0};
-   unsigned wg_dim = set_work_size(&grid, block_x, block_y, block_z, width, height, depth);
-
-   /* Get the shader key. */
-   union ac_cs_blit_key key;
-   key.key = 0;
-
-   /* Only ACO can form VMEM clauses for image stores, which is a requirement for performance. */
-   key.use_aco = true;
-   key.is_clear = is_clear;
-   key.wg_dim = wg_dim;
-   key.has_start_xyz = start_x || start_y || start_z;
-   key.log_lane_width = util_logbase2(lane_size.x);
-   key.log_lane_height = util_logbase2(lane_size.y);
-   key.log_lane_depth = util_logbase2(lane_size.z);
-   key.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
-                       info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
-   key.dst_is_msaa = dst_samples > 1;
-   key.dst_has_z = info->dst.resource->target == PIPE_TEXTURE_3D ||
-                       info->dst.resource->target == PIPE_TEXTURE_CUBE ||
-                       info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY ||
-                       info->dst.resource->target == PIPE_TEXTURE_2D_ARRAY ||
-                       info->dst.resource->target == PIPE_TEXTURE_CUBE_ARRAY;
-   key.last_dst_channel = util_format_get_last_component(info->dst.format);
-
-   /* ACO doesn't support D16 on GFX8 */
-   bool has_d16 = sctx->gfx_level >= (key.use_aco || sctx->screen->use_aco ? GFX9 : GFX8);
-
-   if (is_clear) {
-      assert(dst_samples <= 8);
-      key.log_samples = util_logbase2(dst_samples);
-      key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box);
-      key.d16 = has_d16 &&
-                    max_dst_chan_size <= (util_format_is_float(info->dst.format) ||
-                                          util_format_is_pure_integer(info->dst.format) ? 16 : 11);
-   } else {
-      key.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
-                          info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
-      key.src_is_msaa = src_samples > 1;
-      key.src_has_z = info->src.resource->target == PIPE_TEXTURE_3D ||
-                          info->src.resource->target == PIPE_TEXTURE_CUBE ||
-                          info->src.resource->target == PIPE_TEXTURE_1D_ARRAY ||
-                          info->src.resource->target == PIPE_TEXTURE_2D_ARRAY ||
-                          info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY;
-      /* Resolving integer formats only copies sample 0. log_samples is then unused. */
-      key.sample0_only = sample0_only;
-      unsigned num_samples = MAX2(src_samples, dst_samples);
-      assert(num_samples <= 8);
-      key.log_samples = sample0_only ? 0 : util_logbase2(num_samples);
-      key.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0));
-      key.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1));
-      key.flip_x = info->src.box.width < 0;
-      key.flip_y = info->src.box.height < 0;
-      key.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
-                             util_format_is_pure_uint(info->dst.format);
-      key.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
-                             util_format_is_pure_sint(info->dst.format);
-      key.dst_is_srgb = util_format_is_srgb(info->dst.format);
-      key.last_src_channel = MIN2(util_format_get_last_component(info->src.format),
-                                      key.last_dst_channel);
-      key.use_integer_one = util_format_is_pure_integer(info->dst.format) &&
-                                key.last_src_channel < key.last_dst_channel &&
-                                key.last_dst_channel == 3;
-      key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) &&
-                    util_is_box_sint16(&info->src.box);
-      key.d16 = has_d16 &&
-                    /* Blitting FP16 using D16 has precision issues. Resolving has precision
-                     * issues all the way down to R11G11B10_FLOAT. */
-                    MIN2(max_dst_chan_size, max_src_chan_size) <=
-                    (util_format_is_pure_integer(info->dst.format) ?
-                        (key.sint_to_uint || key.uint_to_sint ? 10 : 16) :
-                        (is_resolve ? 10 : 11));
-   }
-
-   void *shader = _mesa_hash_table_u64_search(sctx->cs_blit_shaders, key.key);
-   if (!shader) {
-      struct ac_cs_blit_options options = {
-         .nir_options = sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR,
-                                                             PIPE_SHADER_COMPUTE),
-         .info = &sctx->screen->info,
-         .use_aco = sctx->screen->use_aco,
-         .no_fmask = sctx->screen->debug_flags & DBG(NO_FMASK),
-         .print_key = si_can_dump_shader(sctx->screen, MESA_SHADER_COMPUTE, SI_DUMP_SHADER_KEY),
-      };
-      shader = si_create_shader_state(sctx, ac_create_blit_cs(&options, &key));
-      _mesa_hash_table_u64_insert(sctx->cs_blit_shaders, key.key, shader);
-   }
-
-   sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
-   sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
-   sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
-   sctx->cs_user_data[3] = (start_x & 0xff) | ((start_y & 0xff) << 8) | ((start_z & 0xff) << 16);
-
-   if (is_clear) {
-      union pipe_color_union final_value;
-      memcpy(&final_value, clear_color, sizeof(final_value));
-
-      /* Do the conversion to sRGB here instead of the shader. */
-      if (util_format_is_srgb(info->dst.format)) {
-         for (int i = 0; i < 3; i++)
-            final_value.f[i] = util_format_linear_to_srgb_float(final_value.f[i]);
-      }
-
-      if (key.d16) {
-         enum pipe_format data_format;
-
-         if (util_format_is_pure_uint(info->dst.format))
-            data_format = PIPE_FORMAT_R16G16B16A16_UINT;
-         else if (util_format_is_pure_sint(info->dst.format))
-            data_format = PIPE_FORMAT_R16G16B16A16_SINT;
-         else
-            data_format = PIPE_FORMAT_R16G16B16A16_FLOAT;
-
-         util_pack_color_union(data_format, (union util_color *)&sctx->cs_user_data[4],
-                               &final_value);
-      } else {
-         memcpy(&sctx->cs_user_data[4], &final_value, sizeof(final_value));
-      }
-   }
-
   /* Shader images. */
   struct pipe_image_view image[2];
   unsigned dst_index = is_clear ? 0 : 1;
@ -1624,7 +979,44 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   image[dst_index].u.tex.first_layer = 0;
   image[dst_index].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);

-   si_launch_grid_internal_images(sctx, image, is_clear ? 1 : 2, &grid, shader, flags |
-                                  (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+   if (info->render_condition_enable)
+      flags |= SI_OP_CS_RENDER_COND_ENABLE;
+
+   /* Execute compute blits. */
+   for (unsigned i = 0; i < out.num_dispatches; i++) {
+      struct ac_cs_blit_dispatch *dispatch = &out.dispatches[i];
+
+      void *shader = _mesa_hash_table_u64_search(sctx->cs_blit_shaders, dispatch->shader_key.key);
+      if (!shader) {
+         shader = si_create_shader_state(sctx, ac_create_blit_cs(&options, &dispatch->shader_key));
+         _mesa_hash_table_u64_insert(sctx->cs_blit_shaders, dispatch->shader_key.key, shader);
+      }
+
+      memcpy(sctx->cs_user_data, dispatch->user_data, sizeof(sctx->cs_user_data));
+
+      struct pipe_grid_info grid = {
+         .block = {
+            dispatch->wg_size[0],
+            dispatch->wg_size[1],
+            dispatch->wg_size[2],
+         },
+         .last_block = {
+            dispatch->last_wg_size[0],
+            dispatch->last_wg_size[1],
+            dispatch->last_wg_size[2],
+         },
+         .grid = {
+            dispatch->num_workgroups[0],
+            dispatch->num_workgroups[1],
+            dispatch->num_workgroups[2],
+         },
+      };
+
+      si_launch_grid_internal_images(sctx, image, is_clear ? 1 : 2, &grid, shader,
+                                     (flags & ~SI_OP_SYNC_BEFORE_AFTER) |
+                                     (i == 0 ? flags & SI_OP_SYNC_BEFORE : 0) |
+                                     (i == out.num_dispatches - 1 ? flags & SI_OP_SYNC_AFTER : 0));
+   }
+
   return true;
 }