diff --git a/src/asahi/lib/agx_device.c b/src/asahi/lib/agx_device.c index b4b6ecbe66e..09752128364 100644 --- a/src/asahi/lib/agx_device.c +++ b/src/asahi/lib/agx_device.c @@ -50,7 +50,6 @@ static const struct debug_named_value agx_debug_options[] = { #ifndef NDEBUG {"dirty", AGX_DBG_DIRTY, "Disable dirty tracking"}, #endif - {"compblit", AGX_DBG_COMPBLIT, "Enable compute blitter"}, {"precompile",AGX_DBG_PRECOMPILE,"Precompile shaders for shader-db"}, {"nocompress",AGX_DBG_NOCOMPRESS,"Disable lossless compression"}, {"nocluster", AGX_DBG_NOCLUSTER,"Disable vertex clustering"}, diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index bc489b108cf..db166c6124a 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -41,7 +41,7 @@ enum agx_dbg { AGX_DBG_NOSHADOW = BITFIELD_BIT(16), /* bit 17 unused */ AGX_DBG_SCRATCH = BITFIELD_BIT(18), - AGX_DBG_COMPBLIT = BITFIELD_BIT(19), + /* bit 19 unused */ AGX_DBG_FEEDBACK = BITFIELD_BIT(20), AGX_DBG_1QUEUE = BITFIELD_BIT(21), }; diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 1015d4bf241..a95b307c8d7 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -20,18 +20,43 @@ #include "pipe/p_state.h" #include "util/format/u_format.h" #include "util/format/u_formats.h" +#include "util/hash_table.h" #include "util/macros.h" +#include "util/ralloc.h" #include "util/u_sampler.h" #include "util/u_surface.h" #include "agx_formats.h" +#include "agx_internal_formats.h" #include "agx_state.h" +#include "glsl_types.h" +#include "nir.h" +#include "nir_builder_opcodes.h" #include "shader_enums.h" -#define BLIT_WG_SIZE 32 +/* For block based blit kernels, we hardcode the maximum tile size which we can + * always achieve. This simplifies our life. 
+ */ +#define TILE_WIDTH 32 +#define TILE_HEIGHT 32 + +static enum pipe_format +effective_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z24X8_UNORM: + return PIPE_FORMAT_R32_FLOAT; + case PIPE_FORMAT_Z16_UNORM: + return PIPE_FORMAT_R16_UNORM; + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_R8_UINT; + default: + return format; + } +} static void * -asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, - bool array) +asahi_blit_compute_shader(struct pipe_context *ctx, struct asahi_blit_key *key) { const nir_shader_compiler_options *options = ctx->screen->get_compiler_options(ctx->screen, PIPE_SHADER_IR_NIR, @@ -40,8 +65,8 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "blit_cs"); nir_builder *b = &b_; - b->shader->info.workgroup_size[0] = BLIT_WG_SIZE; - b->shader->info.workgroup_size[1] = BLIT_WG_SIZE; + b->shader->info.workgroup_size[0] = TILE_WIDTH; + b->shader->info.workgroup_size[1] = TILE_HEIGHT; b->shader->info.num_ubos = 1; BITSET_SET(b->shader->info.textures_used, 0); @@ -50,51 +75,153 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, nir_def *zero = nir_imm_int(b, 0); - nir_def *params[3]; + nir_def *params[4]; b->shader->num_uniforms = ARRAY_SIZE(params); for (unsigned i = 0; i < b->shader->num_uniforms; ++i) { params[i] = nir_load_ubo(b, 2, 32, zero, nir_imm_int(b, i * 8), .align_mul = 4, .range = ~0); } - nir_def *ids = - nir_trim_vector(b, nir_load_global_invocation_id(b, 32), array ? 
3 : 2); + nir_def *trans_offs = params[0]; + nir_def *trans_scale = params[1]; + nir_def *dst_offs_2d = params[2]; + nir_def *dimensions_el_2d = params[3]; - nir_def *tex_pos = nir_u2f32(b, ids); - nir_def *pos2 = - nir_ffma(b, nir_trim_vector(b, tex_pos, 2), params[1], params[0]); - if (array) { - tex_pos = nir_vector_insert_imm(b, nir_pad_vector(b, pos2, 3), - nir_channel(b, tex_pos, 2), 2); - } else { - tex_pos = pos2; + nir_def *phys_id_el_nd = nir_trim_vector( + b, nir_load_global_invocation_id(b, 32), key->array ? 3 : 2); + nir_def *phys_id_el_2d = nir_trim_vector(b, phys_id_el_nd, 2); + nir_def *layer = key->array ? nir_channel(b, phys_id_el_nd, 2) : NULL; + + /* Offset within the tile. We're dispatched for the entire tile but the + * beginning might be out-of-bounds, so fix up. + */ + nir_def *offs_in_tile_el_2d = nir_iand_imm(b, dst_offs_2d, 31); + nir_def *logical_id_el_2d = nir_isub(b, phys_id_el_2d, offs_in_tile_el_2d); + + nir_def *image_pos_2d = nir_iadd(b, logical_id_el_2d, dst_offs_2d); + nir_def *image_pos_nd = image_pos_2d; + if (layer) { + image_pos_nd = + nir_vector_insert_imm(b, nir_pad_vector(b, image_pos_nd, 3), layer, 2); } - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); - tex->dest_type = nir_type_uint32; /* irrelevant */ - tex->sampler_dim = GLSL_SAMPLER_DIM_2D; - tex->is_array = array; - tex->op = nir_texop_tex; - tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); - tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP; - tex->coord_components = array ? 
3 : 2; - tex->texture_index = 0; - tex->sampler_index = 0; - nir_def_init(&tex->instr, &tex->def, 4, 32); - nir_builder_instr_insert(b, &tex->instr); - nir_def *color = &tex->def; + nir_def *in_bounds = nir_ige(b, logical_id_el_2d, nir_imm_ivec2(b, 0, 0)); + in_bounds = + nir_iand(b, in_bounds, nir_ilt(b, logical_id_el_2d, dimensions_el_2d)); + + nir_def *colour0, *colour1; + nir_push_if(b, nir_ball(b, in_bounds)); + { + /* For pixels within the copy area, texture from the source */ + nir_def *coords_el_2d = + nir_ffma(b, nir_u2f32(b, logical_id_el_2d), trans_scale, trans_offs); + + nir_def *coords_el_nd = coords_el_2d; + if (layer) { + coords_el_nd = nir_vector_insert_imm( + b, nir_pad_vector(b, coords_el_nd, 3), nir_u2f32(b, layer), 2); + } + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + tex->dest_type = nir_type_uint32; /* irrelevant */ + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->is_array = key->array; + tex->op = nir_texop_tex; + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coords_el_nd); + tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP; + tex->coord_components = coords_el_nd->num_components; + tex->texture_index = 0; + tex->sampler_index = 0; + nir_def_init(&tex->instr, &tex->def, 4, 32); + nir_builder_instr_insert(b, &tex->instr); + colour0 = &tex->def; + } + nir_push_else(b, NULL); + { + /* For out-of-bounds pixels, copy in the destination */ + colour1 = nir_image_load( + b, 4, 32, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos_nd), zero, zero, + .image_array = key->array, .image_dim = GLSL_SAMPLER_DIM_2D, + .access = ACCESS_IN_BOUNDS_AGX, .dest_type = nir_type_uint32); + } + nir_pop_if(b, NULL); + nir_def *color = nir_if_phi(b, colour0, colour1); + + enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE; + bool src_sint = util_format_is_pure_sint(key->src_format); + bool dst_sint = util_format_is_pure_sint(key->dst_format); + if (util_format_is_pure_integer(key->src_format) && + util_format_is_pure_integer(key->dst_format)) { + + if 
(src_sint && !dst_sint) + clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT; + else if (!src_sint && dst_sint) + clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT; + } if (clamp == ASAHI_BLIT_CLAMP_SINT_TO_UINT) color = nir_imax(b, color, nir_imm_int(b, 0)); else if (clamp == ASAHI_BLIT_CLAMP_UINT_TO_SINT) color = nir_umin(b, color, nir_imm_int(b, INT32_MAX)); - nir_def *image_pos = - nir_iadd(b, ids, nir_pad_vector_imm_int(b, params[2], 0, array ? 3 : 2)); + nir_def *local_offset = nir_imm_intN_t(b, 0, 16); + nir_def *lid = nir_trim_vector(b, nir_load_local_invocation_id(b), 2); + lid = nir_u2u16(b, lid); - nir_image_store(b, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos), zero, - color, zero, .image_dim = GLSL_SAMPLER_DIM_2D, - .access = ACCESS_NON_READABLE, .image_array = array); + /* Pure integer formats need to be clamped in software, at least in some + * cases. We do so on store. Piglit gl-3.0-render-integer checks this, as + * does KHR-GL33.packed_pixels.*. + * + * TODO: Make this common code somehow. + */ + const struct util_format_description *desc = + util_format_description(key->dst_format); + unsigned c = util_format_get_first_non_void_channel(key->dst_format); + + if (desc->channel[c].size <= 16 && + util_format_is_pure_integer(key->dst_format)) { + + unsigned bits[4] = { + desc->channel[0].size ?: desc->channel[0].size, + desc->channel[1].size ?: desc->channel[0].size, + desc->channel[2].size ?: desc->channel[0].size, + desc->channel[3].size ?: desc->channel[0].size, + }; + + if (util_format_is_pure_sint(key->dst_format)) + color = nir_format_clamp_sint(b, color, bits); + else + color = nir_format_clamp_uint(b, color, bits); + + color = nir_u2u16(b, color); + } + + /* The source texel has been converted into a 32-bit value. We need to + * convert it to a tilebuffer format that can then be converted to the + * destination format in the PBE hardware. That's the renderable format for + * the destination format, which must exist along this path.
This mirrors the + * flow of fragment and end-of-tile shaders. + */ + enum pipe_format tib_format = + agx_pixel_format[effective_format(key->dst_format)].renderable; + + nir_store_local_pixel_agx(b, color, nir_imm_int(b, 1), lid, .base = 0, + .write_mask = 0xf, .format = tib_format, + .explicit_coord = true); + + nir_barrier(b, .execution_scope = SCOPE_WORKGROUP); + + nir_push_if(b, nir_ball(b, nir_ieq_imm(b, lid, 0))); + { + nir_def *pbe_index = nir_imm_int(b, 2); + nir_block_image_store_agx( + b, pbe_index, local_offset, image_pos_nd, .format = tib_format, + .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = key->array, + .explicit_coord = true); + } + nir_pop_if(b, NULL); + b->shader->info.cs.image_block_size_per_thread_agx = + util_format_get_blocksize(key->dst_format); return pipe_shader_from_nir(ctx, b->shader); } @@ -219,6 +346,8 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info, fui(y_scale), info->dst.box.x, info->dst.box.y, + info->dst.box.width, + info->dst.box.height, }; struct pipe_constant_buffer cb = { @@ -273,32 +402,51 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info, src_view = ctx->create_sampler_view(ctx, src, &src_templ); ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, true, &src_view); - enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE; - bool src_sint = util_format_is_pure_sint(info->src.format); - bool dst_sint = util_format_is_pure_sint(info->dst.format); - if (util_format_is_pure_integer(info->src.format) && - util_format_is_pure_integer(info->dst.format)) { + struct asahi_blit_key key = { + .src_format = info->src.format, + .dst_format = info->dst.format, + .array = array, + }; + struct hash_entry *ent = _mesa_hash_table_search(blitter->blit_cs, &key); + void *cs = NULL; - if (src_sint && !dst_sint) - clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT; - else if (!src_sint && dst_sint) - clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT; + if (ent) { + cs = ent->data; + } else { + cs = 
asahi_blit_compute_shader(ctx, &key); + _mesa_hash_table_insert( + blitter->blit_cs, ralloc_memdup(blitter->blit_cs, &key, sizeof(key)), + cs); } - if (!blitter->blit_cs[clamp][array]) { - blitter->blit_cs[clamp][array] = - asahi_blit_compute_shader(ctx, clamp, array); - } + assert(cs != NULL); + ctx->bind_compute_state(ctx, cs); - ctx->bind_compute_state(ctx, blitter->blit_cs[clamp][array]); + /* Expand the grid so destinations are in tiles */ + unsigned expanded_x0 = info->dst.box.x & ~(TILE_WIDTH - 1); + unsigned expanded_y0 = info->dst.box.y & ~(TILE_HEIGHT - 1); + unsigned expanded_x1 = + align(info->dst.box.x + info->dst.box.width, TILE_WIDTH); + unsigned expanded_y1 = + align(info->dst.box.y + info->dst.box.height, TILE_HEIGHT); + + /* But clamp to the destination size to save some redundant threads */ + expanded_x1 = + MIN2(expanded_x1, u_minify(info->dst.resource->width0, info->dst.level)); + expanded_y1 = + MIN2(expanded_y1, u_minify(info->dst.resource->height0, info->dst.level)); + + /* Recalculate the width/height based on the expanded grid */ + width = expanded_x1 - expanded_x0; + height = expanded_y1 - expanded_y0; struct pipe_grid_info grid_info = { - .block = {BLIT_WG_SIZE, BLIT_WG_SIZE, 1}, - .last_block = {width % BLIT_WG_SIZE, height % BLIT_WG_SIZE, 1}, + .block = {TILE_WIDTH, TILE_HEIGHT, 1}, + .last_block = {width % TILE_WIDTH, height % TILE_HEIGHT, 1}, .grid = { - DIV_ROUND_UP(width, BLIT_WG_SIZE), - DIV_ROUND_UP(height, BLIT_WG_SIZE), + DIV_ROUND_UP(width, TILE_WIDTH), + DIV_ROUND_UP(height, TILE_HEIGHT), depth, }, }; @@ -378,11 +526,7 @@ agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) agx_legalize_compression(ctx, agx_resource(info->src.resource), info->src.format); - if (asahi_compute_blit_supported(info) && - (agx_device(pipe->screen)->debug & AGX_DBG_COMPBLIT) && - !(ail_is_compressed(&agx_resource(info->dst.resource)->layout) && - util_format_get_blocksize(info->dst.format) == 16)) { - + if 
(asahi_compute_blit_supported(info)) { asahi_compute_blit(pipe, info, &ctx->compute_blitter); return; } diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c index 906daa01ce6..8efa8e6a7fa 100644 --- a/src/gallium/drivers/asahi/agx_disk_cache.c +++ b/src/gallium/drivers/asahi/agx_disk_cache.c @@ -20,7 +20,7 @@ #include "agx_state.h" /* Flags that are allowed and do not disable the disk cache */ -#define ALLOWED_FLAGS (AGX_DBG_NO16 | AGX_DBG_COMPBLIT) +#define ALLOWED_FLAGS (AGX_DBG_NO16) /** * Compute a disk cache key for the given uncompiled shader and shader key. diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index a6ce72b0f81..0713ecf2db0 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -1919,6 +1919,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) agx_init_meta_shaders(ctx); ctx->blitter = util_blitter_create(pctx); + ctx->compute_blitter.blit_cs = asahi_blit_key_table_create(ctx); ctx->result_buf = agx_bo_create(agx_device(screen), diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 939828cd172..b9799ac4a4f 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -2767,9 +2767,13 @@ agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs, struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view); /* For the texture descriptor, lower cubes to 2D arrays. This matches the - * transform done in the compiler. + * transform done in the compiler. Also, force 2D arrays for internal + * blitter images, this helps reduce shader variants. 
*/ - if (target_is_cube(sampler_view.target)) + bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL); + + if (target_is_cube(sampler_view.target) || + (sampler_view.target == PIPE_TEXTURE_3D && internal)) sampler_view.target = PIPE_TEXTURE_2D_ARRAY; agx_pack_texture(texture, agx_resource(view->resource), view->format, diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 496a50b1b2b..403ba72273b 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -575,11 +575,16 @@ enum asahi_blit_clamp { ASAHI_BLIT_CLAMP_COUNT, }; +struct asahi_blit_key { + enum pipe_format src_format, dst_format; + bool array; +}; + +DERIVE_HASH_TABLE(asahi_blit_key); + struct asahi_blitter { bool active; - - /* [clamp_type][is_array] */ - void *blit_cs[ASAHI_BLIT_CLAMP_COUNT][2]; + struct hash_table *blit_cs; /* [filter] */ void *sampler[2];