mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 22:49:13 +02:00
asahi: Implement scratch allocation
Implement our helper program to map scratch blocks into stack memory, and the driver side that allocates these blocks as necessary. Allocation is grow-only right now. Drivers are expected to instantiate scratch memory managers for each shader type (VS, FS, CS) and the same buffers are reused across commands for each one, growing as necessary. Signed-off-by: Asahi Lina <lina@asahilina.net> Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
This commit is contained in:
parent
da9677f14b
commit
494399c65c
5 changed files with 415 additions and 3 deletions
|
|
@ -32,6 +32,7 @@ enum agx_dbg {
|
|||
AGX_DBG_NOMSAA = BITFIELD_BIT(15),
|
||||
AGX_DBG_NOSHADOW = BITFIELD_BIT(16),
|
||||
AGX_DBG_VARYINGS = BITFIELD_BIT(17),
|
||||
AGX_DBG_SCRATCH = BITFIELD_BIT(18),
|
||||
};
|
||||
|
||||
/* Dummy partial declarations, pending real UAPI */
|
||||
|
|
|
|||
|
|
@ -5,11 +5,30 @@
|
|||
|
||||
#include "agx_scratch.h"
|
||||
#include "asahi/compiler/agx_compile.h"
|
||||
#include "shaders/helper.h"
|
||||
#include "util/u_hexdump.h"
|
||||
#include "agx_bo.h"
|
||||
#include "libagx_shaders.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder_opcodes.h"
|
||||
|
||||
/* Scratch block records store GPU addresses shifted right by this amount. */
#define AGX_ADDR_SHIFT 8
#define AGX_THREADS_PER_GROUP 32
/* Granularity (in dwords per thread) of spill size accounting. */
#define AGX_SPILL_UNIT_DWORDS 8

// FIXME: What is the actual value here? Seems to be 96 + 8 or so?
#define AGX_MAX_SUBGROUPS_PER_CORE 128

// Unknown if this goes higher.
#define AGX_MAX_SCRATCH_BLOCK_LOG4 6

/* Largest supported per-thread scratch size: 4 blocks of the maximum
 * block size (AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)).
 */
#define AGX_MAX_SCRATCH_DWORDS \
   ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4)

/* Decoded scratch allocation: `count` blocks, each of
 * AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords per thread.
 */
struct spill_size {
   uint32_t log4_bsize;
   uint32_t count;
};
|
||||
|
||||
struct agx_bo *
|
||||
agx_build_helper(struct agx_device *dev)
|
||||
{
|
||||
|
|
@ -38,15 +57,263 @@ agx_build_helper(struct agx_device *dev)
|
|||
util_dynarray_fini(&binary);
|
||||
ralloc_free(b.shader);
|
||||
|
||||
if (dev->debug & AGX_DBG_SCRATCH)
|
||||
fprintf(stderr, "Helper: 0x%" PRIx64 "\n", bo->ptr.gpu);
|
||||
|
||||
return bo;
|
||||
}
|
||||
|
||||
static struct spill_size
|
||||
agx_scratch_get_spill_size(unsigned dwords)
|
||||
{
|
||||
if (!dwords) {
|
||||
return (struct spill_size){0, 0};
|
||||
}
|
||||
assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
|
||||
|
||||
unsigned log4 =
|
||||
util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
|
||||
unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
|
||||
if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
|
||||
// Max size case (4 blocks)
|
||||
assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
|
||||
log4--;
|
||||
blocks = 4;
|
||||
} else if (blocks == 4) {
|
||||
// Non max size 4 block case, shift to next log4 unit for consistency.
|
||||
log4++;
|
||||
blocks = 1;
|
||||
}
|
||||
|
||||
return (struct spill_size){log4, blocks};
|
||||
}
|
||||
|
||||
unsigned
|
||||
agx_scratch_get_bucket(uint32_t dwords)
|
||||
{
|
||||
/* For debugging/analysis purposes, scratch allocation sizes are
|
||||
* divided into buckets. Since we only allocate a single global
|
||||
* worst-case scratch buffer, these buckets do not have any meaning
|
||||
* for the actual allocation mechanism. They are only used to log
|
||||
* allocation sizes. We just use a simple log2 of the size here.
|
||||
*/
|
||||
|
||||
if (!dwords)
|
||||
return 0;
|
||||
assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
|
||||
|
||||
return MIN2(
|
||||
AGX_SPILL_SIZE_BUCKETS - 1,
|
||||
1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
|
||||
}
|
||||
|
||||
static void
|
||||
agx_scratch_realloc(struct agx_scratch *scratch)
|
||||
{
|
||||
if (scratch->buf)
|
||||
agx_bo_unreference(scratch->buf);
|
||||
|
||||
struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);
|
||||
|
||||
if (scratch->dev->debug & AGX_DBG_SCRATCH)
|
||||
fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
|
||||
scratch->size_dwords, size.log4_bsize, size.count,
|
||||
scratch->subgroups);
|
||||
|
||||
unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
|
||||
size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
|
||||
scratch->size_dwords = block_dwords * size.count;
|
||||
|
||||
if (scratch->dev->debug & AGX_DBG_SCRATCH)
|
||||
fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
|
||||
size.log4_bsize);
|
||||
|
||||
unsigned block_count = size.count;
|
||||
|
||||
if (scratch->dev->debug & AGX_DBG_SCRATCH)
|
||||
fprintf(stderr, "Block count: %d\n", block_count);
|
||||
|
||||
size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;
|
||||
|
||||
size_t header_size = sizeof(struct agx_helper_header);
|
||||
|
||||
size_t blocklist_off = header_size;
|
||||
size_t blocklist_core_size =
|
||||
scratch->subgroups * sizeof(struct agx_helper_block);
|
||||
size_t blocklist_size = blocklist_core_size * scratch->num_cores;
|
||||
|
||||
size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
|
||||
size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;
|
||||
|
||||
unsigned flags = 0;
|
||||
#ifdef SCRATCH_DEBUG
|
||||
flags = AGX_BO_WRITEBACK;
|
||||
#endif
|
||||
scratch->buf = agx_bo_create_aligned(scratch->dev, total_alloc,
|
||||
block_size_bytes, flags, "Scratch");
|
||||
memset(scratch->buf->ptr.cpu, 0, blocks_off);
|
||||
|
||||
struct agx_helper_header *hdr = scratch->buf->ptr.cpu;
|
||||
scratch->header = hdr;
|
||||
|
||||
uint64_t blocklist_gpu = scratch->buf->ptr.gpu + blocklist_off;
|
||||
struct agx_helper_block *blocklist_cpu =
|
||||
scratch->buf->ptr.cpu + blocklist_off;
|
||||
|
||||
#ifdef SCRATCH_DEBUG
|
||||
scratch->blocklist = blocklist_cpu;
|
||||
scratch->data = scratch->buf->ptr.cpu + blocks_off;
|
||||
scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
|
||||
#endif
|
||||
|
||||
uint64_t blocks_gpu = scratch->buf->ptr.gpu + blocks_off;
|
||||
|
||||
hdr->subgroups = scratch->subgroups;
|
||||
|
||||
unsigned num_cores = 0;
|
||||
unsigned core_id;
|
||||
for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
|
||||
unsigned cores_per_cluster =
|
||||
util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
|
||||
unsigned cluster = core_id / cores_per_cluster;
|
||||
unsigned core = core_id % cores_per_cluster;
|
||||
if (cluster >= scratch->dev->params.num_clusters_total)
|
||||
break;
|
||||
if (core >= scratch->dev->params.num_cores_per_cluster ||
|
||||
!(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
|
||||
continue;
|
||||
num_cores++;
|
||||
#ifdef SCRATCH_DEBUG
|
||||
scratch->core_present[core_id] = true;
|
||||
#endif
|
||||
|
||||
hdr->cores[core_id].blocklist = blocklist_gpu;
|
||||
|
||||
for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
|
||||
uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
|
||||
assert(!(blocks_gpu & (block_size_bytes - 1)));
|
||||
|
||||
uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
|
||||
uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
|
||||
blocklist_cpu[sg].blocks[0] = mask | base;
|
||||
for (int block = 1; block <= 3; block++) {
|
||||
if (block_count >= (block + 1))
|
||||
blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
|
||||
else
|
||||
blocklist_cpu[sg].blocks[block] = 0;
|
||||
}
|
||||
|
||||
blocks_gpu += block_size_bytes * block_count;
|
||||
}
|
||||
|
||||
blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
|
||||
blocklist_cpu += scratch->subgroups;
|
||||
}
|
||||
scratch->max_core_id = core_id;
|
||||
assert(num_cores == scratch->num_cores);
|
||||
|
||||
if (scratch->dev->debug & AGX_DBG_SCRATCH)
|
||||
fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
|
||||
scratch->buf->ptr.gpu, scratch->buf->size);
|
||||
}
|
||||
|
||||
void
|
||||
agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
|
||||
size_t subgroups)
|
||||
{
|
||||
bool realloc = false;
|
||||
|
||||
if (!dwords)
|
||||
return;
|
||||
|
||||
assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
|
||||
|
||||
if (!subgroups)
|
||||
subgroups = AGX_MAX_SUBGROUPS_PER_CORE;
|
||||
|
||||
subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);
|
||||
|
||||
if (dwords > scratch->size_dwords) {
|
||||
scratch->size_dwords = dwords;
|
||||
realloc = true;
|
||||
}
|
||||
|
||||
if (subgroups > scratch->subgroups) {
|
||||
scratch->subgroups = subgroups;
|
||||
realloc = true;
|
||||
}
|
||||
|
||||
if (realloc) {
|
||||
agx_scratch_realloc(scratch);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
agx_scratch_debug_pre(struct agx_scratch *scratch)
|
||||
{
|
||||
if (!scratch->buf)
|
||||
return;
|
||||
|
||||
for (int core = 0; core < scratch->max_core_id; core++) {
|
||||
assert(!scratch->header->cores[core].alloc_cur);
|
||||
scratch->header->cores[core].alloc_max = 0;
|
||||
scratch->header->cores[core].alloc_failed = 0;
|
||||
memset(scratch->header->cores[core].alloc_count, 0,
|
||||
sizeof(scratch->header->cores[core].alloc_count));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
agx_scratch_debug_post(struct agx_scratch *scratch)
|
||||
{
|
||||
if (!scratch->buf)
|
||||
return;
|
||||
|
||||
fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->ptr.gpu);
|
||||
|
||||
for (int core = 0; core < scratch->max_core_id; core++) {
|
||||
fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
|
||||
scratch->header->cores[core].alloc_max,
|
||||
scratch->header->cores[core].alloc_failed);
|
||||
|
||||
for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
|
||||
fprintf(stderr, " %d:%-3d",
|
||||
bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
|
||||
scratch->header->cores[core].alloc_count[bucket]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
assert(!scratch->header->cores[core].alloc_cur);
|
||||
assert(!scratch->header->cores[core].alloc_failed);
|
||||
}
|
||||
|
||||
#ifdef SCRATCH_DEBUG
|
||||
unsigned core_index = 0;
|
||||
for (int core = 0; core < scratch->max_core_id; core++) {
|
||||
if (!scratch->core_present[core])
|
||||
continue;
|
||||
void *p = scratch->data + scratch->core_size * core_index++;
|
||||
fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size);
|
||||
u_hexdump(stderr, p, scratch->core_size, true);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
|
||||
{
|
||||
memset(scratch, 0, sizeof(*scratch));
|
||||
|
||||
scratch->dev = dev;
|
||||
scratch->num_cores = 0;
|
||||
for (unsigned cl = 0; cl < dev->params.num_clusters_total; cl++) {
|
||||
scratch->num_cores += util_bitcount(dev->params.core_masks[cl]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
agx_scratch_fini(struct agx_scratch *scratch)
|
||||
{
|
||||
if (scratch->buf)
|
||||
agx_bo_unreference(scratch->buf);
|
||||
scratch->buf = NULL;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,15 +6,38 @@
|
|||
#define AGX_SCRATCH_H
|
||||
|
||||
#include "agx_device.h"
|
||||
#include <agx_pack.h>
|
||||
|
||||
// #define SCRATCH_DEBUG
|
||||
|
||||
/* Grow-only scratch (spill/stack) memory manager. Drivers instantiate one
 * per shader stage; the backing BO is created lazily and reallocated only
 * when size or subgroup demand grows (see agx_scratch_alloc).
 */
struct agx_scratch {
   struct agx_device *dev;
   /* Backing BO; NULL until the first allocation request. */
   struct agx_bo *buf;
   /* One past the highest core ID laid out in the buffer. */
   uint32_t max_core_id;
   /* Number of enabled cores, counted from the device core masks. */
   uint32_t num_cores;

   /* Current worst-case demand backed by `buf`. */
   uint32_t subgroups;
   uint32_t size_dwords;

   /* CPU view of the helper header at the start of `buf`. */
   struct agx_helper_header *header;

#ifdef SCRATCH_DEBUG
   bool core_present[1024];
   struct agx_helper_block *blocklist;
   void *data;
   size_t core_size;
#endif
};

/* Build the GPU helper program; returns a new BO owned by the caller. */
struct agx_bo *agx_build_helper(struct agx_device *dev);

void agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch);
void agx_scratch_fini(struct agx_scratch *scratch);
void agx_scratch_debug_pre(struct agx_scratch *scratch);
void agx_scratch_debug_post(struct agx_scratch *scratch);

/* Map a size in dwords to a statistics bucket (logging only).
 * NOTE(review): defined with return type `unsigned` in agx_scratch.c;
 * identical to uint32_t on supported targets — confirm intended. */
uint32_t agx_scratch_get_bucket(uint32_t dwords);

/* Ensure scratch backs `dwords` per thread for `subgroups` subgroups per
 * core (0 = worst case). May reallocate the buffer. */
void agx_scratch_alloc(struct agx_scratch *scratch, uint32_t dwords,
                       size_t subgroups);

#endif
|
||||
|
|
|
|||
|
|
@ -2,13 +2,92 @@
|
|||
* Copyright 2023 Asahi Lina
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "helper.h"
|
||||
#include "libagx.h"
|
||||
|
||||
/* Doorbell IDs used to hand-shake with the firmware: request the next
 * helper op, then acknowledge or reject it. (Deduplicated: the three
 * defines were repeated verbatim.)
 */
#define DB_NEXT 32
#define DB_ACK  48
#define DB_NACK 49
|
||||
/* Operations the firmware can request from the helper program. */
enum helper_op {
   OP_STACK_ALLOC = 0,       /* map a scratch block set into stack memory */
   OP_STACK_FREE = 1,        /* unmap the most recent block set */
   OP_THREADGROUP_ALLOC = 4, /* not implemented (compute preemption) */
   OP_THREADGROUP_FREE = 5,  /* not implemented (compute preemption) */
   OP_END = 15,              /* terminate the helper loop */
};
|
||||
|
||||
void
|
||||
libagx_helper(void)
|
||||
{
|
||||
uint64_t arg =
|
||||
nir_load_helper_arg_lo_agx() | (((uint64_t)nir_load_helper_arg_hi_agx()) << 32);
|
||||
|
||||
global struct agx_helper_header *hdr =
|
||||
(global struct agx_helper_header *)arg;
|
||||
|
||||
uint32_t core_index = nir_load_core_id_agx();
|
||||
uint32_t subgroups = hdr->subgroups;
|
||||
global struct agx_helper_core *core = &hdr->cores[core_index];
|
||||
|
||||
while (1) {
|
||||
nir_doorbell_agx(DB_NEXT);
|
||||
uint32_t op = nir_load_helper_op_id_agx();
|
||||
uint32_t arg = nir_load_helper_arg_lo_agx();
|
||||
|
||||
switch (op) {
|
||||
case OP_STACK_ALLOC: {
|
||||
uint32_t idx = core->alloc_cur;
|
||||
if (idx >= subgroups) {
|
||||
core->alloc_failed++;
|
||||
nir_doorbell_agx(DB_NACK);
|
||||
break;
|
||||
}
|
||||
core->alloc_max = max(core->alloc_max, ++core->alloc_cur);
|
||||
core->alloc_count[arg]++;
|
||||
|
||||
nir_stack_map_agx(0, core->blocklist[idx].blocks[0]);
|
||||
nir_stack_map_agx(1, core->blocklist[idx].blocks[1]);
|
||||
nir_stack_map_agx(2, core->blocklist[idx].blocks[2]);
|
||||
nir_stack_map_agx(3, core->blocklist[idx].blocks[3]);
|
||||
nir_doorbell_agx(DB_ACK);
|
||||
break;
|
||||
}
|
||||
|
||||
case OP_STACK_FREE: {
|
||||
if (!core->alloc_cur) { // underflow
|
||||
nir_doorbell_agx(DB_NACK);
|
||||
break;
|
||||
}
|
||||
uint32_t idx = --core->alloc_cur;
|
||||
core->blocklist[idx].blocks[0] = nir_stack_unmap_agx(0);
|
||||
core->blocklist[idx].blocks[1] = nir_stack_unmap_agx(1);
|
||||
core->blocklist[idx].blocks[2] = nir_stack_unmap_agx(2);
|
||||
core->blocklist[idx].blocks[3] = nir_stack_unmap_agx(3);
|
||||
nir_doorbell_agx(DB_ACK);
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: Implement threadgroup allocs (for compute preemption)
|
||||
case OP_THREADGROUP_ALLOC: {
|
||||
nir_doorbell_agx(DB_NACK);
|
||||
break;
|
||||
}
|
||||
|
||||
case OP_THREADGROUP_FREE: {
|
||||
nir_doorbell_agx(DB_NACK);
|
||||
break;
|
||||
}
|
||||
|
||||
case OP_END: {
|
||||
nir_fence_helper_exit_agx();
|
||||
return;
|
||||
}
|
||||
|
||||
default:
|
||||
*(global uint32_t *)(0xdead0000 | (op << 8)) = 0;
|
||||
nir_fence_helper_exit_agx();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
42
src/asahi/lib/shaders/helper.h
Normal file
42
src/asahi/lib/shaders/helper.h
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
/*
 * Copyright 2023 Asahi Lina
 * SPDX-License-Identifier: MIT
 */

/* Shared driver/GPU layout of the scratch helper data structures. These
 * are packed ABI structs read by the helper program (helper.cl) and
 * written by the driver (agx_scratch.c); the static asserts pin the
 * layout on both sides. Do not reorder fields.
 */

#ifndef LIBAGX_HELPER_H
#define LIBAGX_HELPER_H

#include "agx_pack.h"
#include "libagx.h"

/* Number of statistics buckets for allocation sizes (logging only). */
#define AGX_SPILL_SIZE_BUCKETS 16

#define AGX_MAX_CORES_PER_CLUSTER 16
#define AGX_MAX_CLUSTERS 8
#define AGX_MAX_CORE_ID (AGX_MAX_CLUSTERS * AGX_MAX_CORES_PER_CLUSTER)

/* One set of up to four scratch block records for a subgroup. */
struct agx_helper_block {
   uint32_t blocks[4];
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_helper_block) == 16);

/* Per-core state: GPU pointer to this core's blocklist plus allocation
 * counters maintained by the helper program. */
struct agx_helper_core {
   GLOBAL(struct agx_helper_block) blocklist;
   uint32_t alloc_cur;    /* currently mapped block sets */
   uint32_t alloc_max;    /* high-water mark */
   uint32_t alloc_failed; /* requests rejected for lack of blocks */
   uint32_t _pad;
   uint32_t alloc_count[AGX_SPILL_SIZE_BUCKETS]; /* per-bucket totals */
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_helper_core) ==
                  (8 + 3 * 4 + AGX_SPILL_SIZE_BUCKETS * 4 + 4));

/* Header at the start of the scratch BO, followed by blocklists and the
 * block data itself (laid out by agx_scratch_realloc). */
struct agx_helper_header {
   uint32_t subgroups; /* block sets available per core */
   uint32_t _pad;
   struct agx_helper_core cores[AGX_MAX_CORE_ID];
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_helper_header) ==
                  (4 + 4 + AGX_MAX_CORE_ID * sizeof(struct agx_helper_core)));

#endif
|
||||
Loading…
Add table
Reference in a new issue