asahi: Implement clear_buffer using libagx_fill*

Use either libagx_fill_uint4 or libagx_fill based of size and object alignment for clear_sizes which are a power of two up to 16. Reported fill rate for 256MB buffers on a M1 Ultra (G13D) in gpu-ratemeter is 355 GB/s for 16 byte aligned buffers and 155 GB/s for 4 byte aligned buffers. Signed-off-by: Janne Grunau <janne-fdr@jannau.net> (cherry picked from commit 5c2d62c030) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40092>
2026-05-07 04:58:05 +02:00 · 2026-01-16 11:53:24 +01:00 · 2026-01-16 11:53:24 +01:00 · 1ce5b5b361
commit 1ce5b5b361
parent 37a269e303
2 changed files with 42 additions and 2 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@ -2914,7 +2914,7 @@
        "description": "asahi: Implement clear_buffer using libagx_fill*",
        "nominated": false,
        "nomination_type": 0,
-        "resolution": 4,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": null,
        "notes": null
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@ -1713,6 +1713,46 @@ asahi_get_device_reset_status(struct pipe_context *pipe)
   return ctx->any_faults ? PIPE_GUILTY_CONTEXT_RESET : PIPE_NO_RESET;
 }

+static void
+asahi_clear_buffer(struct pipe_context *pipe, struct pipe_resource *resource,
+                   unsigned offset, unsigned size, const void *clear_value,
+                   int clear_value_size)
+{
+   assert(clear_value_size > 0);
+   if (clear_value_size <= 16 && util_is_power_of_two_nonzero(clear_value_size)) {
+      union pipe_color_union color;
+      bool aligned_16 = util_is_aligned(offset, 16) && util_is_aligned(size, 16);
+      bool aligned_4 = util_is_aligned(offset, 4) && util_is_aligned(size, 4);
+
+      /* Splat out to 128-bit */
+      uint8_t *bytes = (uint8_t *)color.ui;
+      memcpy(bytes, clear_value, clear_value_size);
+      for (unsigned i = clear_value_size; i < 16; ++i) {
+         bytes[i] = bytes[i % clear_value_size];
+      }
+
+      if (aligned_16) {
+         struct agx_batch *batch = agx_get_compute_batch(agx_context(pipe));
+         agx_batch_init_state(batch);
+         agx_batch_writes_range(batch, agx_resource(resource), offset, size);
+         libagx_fill_uint4(batch, agx_2d(size / 16, 1), AGX_BARRIER_ALL,
+                           agx_map_gpu(agx_resource(resource)) + offset, 16,
+                           color.ui[0], color.ui[1], color.ui[2], color.ui[3]);
+         return;
+      } else if (aligned_4 && clear_value_size <= 4) {
+         struct agx_batch *batch = agx_get_compute_batch(agx_context(pipe));
+         agx_batch_init_state(batch);
+         agx_batch_writes_range(batch, agx_resource(resource), offset, size);
+         libagx_fill(batch, agx_1d(size / 4), AGX_BARRIER_ALL,
+                     agx_map_gpu(agx_resource(resource)) + offset, color.ui[0]);
+         return;
+      }
+   }
+
+   u_default_clear_buffer(pipe, resource, offset, size, clear_value,
+                          clear_value_size);
+}
+
 static struct pipe_context *
 agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
 {
@ -1764,7 +1804,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
   pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;

   pctx->buffer_subdata = u_default_buffer_subdata;
-   pctx->clear_buffer = u_default_clear_buffer;
+   pctx->clear_buffer = asahi_clear_buffer;
   pctx->texture_subdata = u_default_texture_subdata;
   pctx->set_debug_callback = u_default_set_debug_callback;
   pctx->get_sample_position = u_default_get_sample_position;