hk: parallelize after-graphics available sets

reduces CDM overhead: availability writes queued at the end of a render
pass are batched and executed by a single parallel dispatch, instead of
one dispatch per write.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31532>
Alyssa Rosenzweig 2024-09-18 18:40:44 -04:00 committed by Marge Bot
parent a2edffad2f
commit ad8f005ecb
6 changed files with 87 additions and 6 deletions

View file

@@ -109,3 +109,9 @@ libagx_increment_ia_counters(constant struct libagx_increment_ia_counters *p,
       *(p->vs_invocations) += count;
    }
 }
+
+void
+libagx_write_u32s(constant struct libagx_imm_write *p, uint id)
+{
+   *(p[id].address) = p[id].value;
+}
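
The kernel is trivially parallel: invocation id performs the id-th queued
store. For clarity, a serial C equivalent (a sketch; imm_write is a
hypothetical host-side stand-in for struct libagx_imm_write):

   #include <stdint.h>

   /* Serial equivalent of libagx_write_u32s: the id-th record stores one
    * 32-bit immediate to one address. On the GPU, each invocation id
    * executes exactly one iteration, so all queued writes retire in a
    * single dispatch at the end of the control stream.
    */
   struct imm_write {
      uint32_t *address;
      uint32_t value;
   };

   static void
   write_u32s_serial(const struct imm_write *p, unsigned count)
   {
      for (unsigned id = 0; id < count; ++id)
         *(p[id].address) = p[id].value;
   }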

View file

@@ -67,3 +67,8 @@ struct libagx_increment_ia_counters {
    uint32_t index_buffer_range_el;
    uint32_t restart_index;
 };
+
+struct libagx_imm_write {
+   GLOBAL(uint32_t) address;
+   uint32_t value;
+};
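
This struct is shared between host and device. Assuming GLOBAL(uint32_t)
lowers to a plain 64-bit device address when the header is compiled for the
CPU (an assumption about the libagx headers), the host can fill records
directly:

   #include <stdint.h>

   /* Hypothetical host-side mirror of struct libagx_imm_write, assuming
    * GLOBAL(uint32_t) is a 64-bit device address on the CPU side.
    */
   struct imm_write_host {
      uint64_t address; /* device VA of the u32 to write */
      uint32_t value;   /* immediate to store there */
   };

   /* Record that marks one query's availability word, e.g. with 1. */
   static struct imm_write_host
   make_availability_write(uint64_t availability_va)
   {
      return (struct imm_write_host){.address = availability_va, .value = 1};
   }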

View file

@@ -269,7 +269,7 @@ hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
    perf_debug(dev, "End command buffer");
 
    hk_cmd_buffer_end_compute(cmd);
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
 
    /* With rasterizer discard, we might end up with empty VDM batches.
     * It is difficult to avoid creating these empty batches, but it's easy to

View file

@@ -325,6 +325,11 @@ struct hk_cs {
       struct hk_scratch_req fs;
    } scratch;
 
+   /* Immediate writes, type libagx_imm_write. These all happen in parallel at
+    * the end of the control stream. This accelerates queries. Implies CDM.
+    */
+   struct util_dynarray imm_writes;
+
    /* Statistics */
    struct {
       uint32_t calls, cmds, flushes;
@@ -565,16 +570,29 @@ hk_cs_destroy(struct hk_cs *cs)
    if (cs->type == HK_CS_VDM) {
       util_dynarray_fini(&cs->scissor);
       util_dynarray_fini(&cs->depth_bias);
+   } else {
+      util_dynarray_fini(&cs->imm_writes);
    }
 
    free(cs);
 }
 
+void hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs);
+
 static void
-hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr)
+hk_cmd_buffer_end_compute_internal(struct hk_cmd_buffer *cmd,
+                                   struct hk_cs **ptr)
 {
    if (*ptr) {
       struct hk_cs *cs = *ptr;
+
+      /* This control stream may write immediates as it ends. Queue the writes
+       * now that we're done emitting everything else.
+       */
+      if (cs->imm_writes.size) {
+         hk_dispatch_imm_writes(cmd, cs);
+      }
+
       void *map = cs->current;
       agx_push(map, CDM_STREAM_TERMINATE, _)
          ;
@@ -588,7 +606,7 @@ hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr)
 static void
 hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd)
 {
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.cs);
 }
 
 static void
@@ -615,8 +633,8 @@ hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd)
       cmd->current_cs.gfx->current = map;
       cmd->current_cs.gfx = NULL;
 
-      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx);
-      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+      hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.pre_gfx);
+      hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
    }
 
    assert(cmd->current_cs.gfx == NULL);
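
The lifecycle across this file: records accumulate in imm_writes while the
command buffer is recorded, and hk_cmd_buffer_end_compute_internal flushes
them in a single dispatch just before CDM_STREAM_TERMINATE, so the writes
land at the very end of the control stream. A sketch of the accumulate side
(simplified from the diff; header paths assumed):

   #include "util/u_dynarray.h" /* Mesa dynarray */
   #include "libagx.h"          /* struct libagx_imm_write (path assumed) */

   /* O(1) append per availability write during recording, instead of one
    * CDM dispatch per write; the batch is flushed once at stream end.
    */
   static void
   queue_imm_write(struct util_dynarray *imm_writes, uint64_t addr,
                   uint32_t value)
   {
      struct libagx_imm_write imm = {.address = addr, .value = value};

      /* Lazily initialize on first use, matching the diff. */
      if (!imm_writes->data)
         util_dynarray_init(imm_writes, NULL);

      util_dynarray_append(imm_writes, struct libagx_imm_write, imm);
   }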

View file

@@ -752,7 +752,7 @@ hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
    cs->cr.zls_control = render->cr.zls_control;
 
    /* Reordering barrier for post-gfx, in case we had any. */
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
 
    /* Don't reorder compute across render passes.
     *

View file

@@ -28,6 +28,7 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/os_time.h"
+#include "util/u_dynarray.h"
 #include "vulkan/vulkan_core.h"
 
 struct hk_query_report {
@@ -209,6 +210,45 @@ hk_nir_write_u32(nir_builder *b, UNUSED const void *key)
    nir_store_global(b, addr, 4, value, nir_component_mask(1));
 }
 
+static void
+hk_nir_write_u32s(nir_builder *b, const void *data)
+{
+   nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
+   nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
+
+   libagx_write_u32s(b, params, id);
+}
+
+void
+hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
+{
+   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
+
+   /* As soon as we mark a query available, it needs to be available system
+    * wide, otherwise a CPU-side get result can race. As such, we cache flush
+    * before and then let coherency work its magic. Without this barrier, we
+    * get flakes in
+    *
+    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
+    */
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+   hk_cdm_cache_flush(dev, cs);
+
+   perf_debug(dev, "Queued writes");
+
+   struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32s, NULL, 0);
+   uint64_t params =
+      hk_pool_upload(cmd, cs->imm_writes.data, cs->imm_writes.size, 16);
+   uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
+
+   uint32_t count =
+      util_dynarray_num_elements(&cs->imm_writes, struct libagx_imm_write);
+   assert(count > 0);
+
+   hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(count, 1, 1),
+                        hk_grid(32, 1, 1));
+}
+
 void
 hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
                bool after_gfx)
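
The dispatch shape is where the CDM overhead goes away: N queued writes
become one dispatch of an N-thread grid in 32-wide workgroups, rather than
N single-write dispatches. Assuming the hardware rounds the global grid up
to whole workgroups (an assumption about hk_dispatch_with_usc), 100
availability writes need only ceil(100/32) = 4 workgroups in one dispatch:

   #include <stdint.h>

   /* Workgroups needed for the batched write dispatch, assuming the
    * count-thread global grid is rounded up to whole 32-wide groups.
    */
   static uint32_t
   imm_write_workgroups(uint32_t count)
   {
      return (count + 31) / 32; /* 100 writes -> 4 groups, 1 dispatch */
   }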
@@ -218,6 +258,18 @@ hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
    if (!cs)
       return;
 
+   /* TODO: Generalize this mechanism suitably */
+   if (after_gfx) {
+      struct libagx_imm_write imm = {.address = address, .value = value};
+
+      if (!cs->imm_writes.data) {
+         util_dynarray_init(&cs->imm_writes, NULL);
+      }
+
+      util_dynarray_append(&cs->imm_writes, struct libagx_imm_write, imm);
+      return;
+   }
+
    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
 
    /* As soon as we mark a query available, it needs to be available system