turnip: Make autotuner work with reusable command buffers
To achieve this, each command buffer now has its own GPU memory for the autotuner's results. The BO usage is still not optimal, though: the ideal pattern would be to suballocate small GPU memory chunks from a memory pool, since most command buffers have only a few renderpasses.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5990
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14996>
This commit is contained in:
parent d9e400b9b6
commit 2e878293f4
4 changed files with 319 additions and 210 deletions
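The suballocation the commit message proposes could look roughly like the CPU-side sketch below. Every name in it is hypothetical; a real version would hand out 128-bit-aligned offsets inside a mapped tu_bo rather than malloc'd bytes.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* One fixed-size backing block; in the driver this would be a tu_bo. */
struct block {
   struct block *next;
   uint32_t used;              /* bytes handed out so far */
   uint8_t data[4096];
};

struct pool {
   struct block *head;         /* block currently being filled */
};

/* Hand out a size-byte chunk, opening a new block when the current one
 * is full. All chunks stay alive until pool_destroy(). */
static void *
pool_alloc(struct pool *p, uint32_t size)
{
   assert(size <= sizeof(p->head->data));
   if (!p->head || p->head->used + size > sizeof(p->head->data)) {
      struct block *b = calloc(1, sizeof(*b));
      b->next = p->head;
      p->head = b;
   }
   void *ptr = p->head->data + p->head->used;
   p->head->used += size;
   return ptr;
}

static void
pool_destroy(struct pool *p)
{
   while (p->head) {
      struct block *next = p->head->next;
      free(p->head);
      p->head = next;
   }
}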
tu_autotune.c
@@ -27,17 +27,34 @@
#include "tu_private.h"
#include "tu_cs.h"

/* In Vulkan an application may fill command buffers from many threads
 * and expect no locking to occur. We do introduce the possibility of
 * locking on renderpass end, however assuming that the application
 * doesn't have a huge amount of slightly different renderpasses,
 * there would be minimal to no contention.
/* How does it work?
 *
 * Other assumptions are:
 * - Application does submit command buffers soon after their creation.
 * - For each renderpass we calculate the number of samples passed
 *   by storing the number before and after in GPU memory.
 * - To store the values each command buffer holds GPU memory which
 *   expands with more renderpasses being written.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on the tu_renderpass_result would be added to the
 *   tu_renderpass_history entry which aggregates results for a
 *   given renderpass.
 * - On submission:
 *   - Process results whose fence was signalled.
 *   - Free per-submission data which we no longer need.
 *
 * Breaking the above may lead to some decrease in performance or
 * the autotuner turning itself off.
 *   - Create a command stream to write a fence value. This way we
 *     know when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result entries
 *       to the queue.
 *
 * Since the command buffers could be recorded on different threads,
 * we have to maintain some amount of locking around the history table;
 * however, the table is only changed from a single thread at submission
 * time, so in most cases there will be no contention.
 */

#define TU_AUTOTUNE_DEBUG_LOG 0
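Condensed into self-contained C, the readback rule the comment above describes is just two helpers; struct rp_samples and these functions are simplified stand-ins for tu_renderpass_samples and the logic in process_results().

#include <stdbool.h>
#include <stdint.h>

/* GPU-visible record: one counter snapshot at renderpass start, one at end. */
struct rp_samples {
   uint64_t samples_start;
   uint64_t samples_end;
};

/* A result may be read back only once the submission's fence value,
 * written by the GPU after the submission finishes, has caught up with it. */
static bool
result_ready(uint32_t result_fence, uint32_t current_fence)
{
   return result_fence <= current_fence;
}

/* The subtraction happens on the CPU so the GPU never has to
 * wait-for-idle between the two snapshots. */
static uint64_t
samples_passed(const struct rp_samples *s)
{
   return s->samples_end - s->samples_start;
}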
@@ -46,9 +63,13 @@
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we keep renderpass stats around. */
#define MAX_HISTORY_LIFETIME 128

#define TU_AUTOTUNE_RP_BO_SIZE 4096

/**
 * Tracks results for a given renderpass key
 */
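As a worked example (illustrative, not part of the patch): each result record defined in tu_autotune.h is four uint64_t fields, i.e. 32 bytes, so one 4096-byte BO holds 4096 / 32 = 128 renderpass results before another BO has to be allocated. A hypothetical helper macro for that count:

#define TU_AUTOTUNE_RESULTS_PER_BO \
   (TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples))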
@@ -67,13 +88,101 @@ struct tu_renderpass_history {
   uint32_t avg_samples;
};

/* Holds per-submission cs which writes the fence. */
struct tu_submission_fence_cs {
   struct list_head node;
   struct tu_cs cs;
   uint32_t fence;
struct tu_autotune_results_buffer
{
   int32_t ref_cnt;

   struct tu_device *device;

   /* TODO: It would be better to suballocate the space from
    * a memory pool, which would create fewer BOs and waste less space.
    */
   struct tu_bo **bos;
   uint32_t num_bos;
   uint32_t results_written;
};

static struct tu_autotune_results_buffer *
tu_autotune_results_buffer_create(struct tu_device *dev)
{
   struct tu_autotune_results_buffer *buffer =
      malloc(sizeof(struct tu_autotune_results_buffer));

   buffer->ref_cnt = 1;
   buffer->device = dev;
   buffer->results_written = 0;
   buffer->num_bos = 0;
   buffer->bos = NULL;

   return buffer;
}

void
tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer)
{
   assert(buffer && buffer->ref_cnt >= 1);
   p_atomic_inc(&buffer->ref_cnt);
}

void
tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer)
{
   assert(buffer && buffer->ref_cnt >= 1);
   if (p_atomic_dec_zero(&buffer->ref_cnt)) {
      for (uint32_t i = 0; i < buffer->num_bos; i++)
         tu_bo_finish(buffer->device, buffer->bos[i]);

      ralloc_free(buffer->bos);
      free(buffer);
   }
}

/* Holds the per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
   struct tu_autotune_results_buffer **buffers;
   uint32_t buffers_count;
};

static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
{
   struct tu_submission_data *submission_data =
      calloc(1, sizeof(struct tu_submission_data));
   submission_data->fence = at->fence_counter;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, at->fence_counter);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);
   for (uint32_t i = 0; i < data->buffers_count; i++) {
      tu_autotune_results_buffer_unref(data->buffers[i]);
   }

   free(data->buffers);
   free(data);
}

#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));
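The ref/unref pair above implements a simple shared-ownership rule. An illustrative timeline for a reusable command buffer (not patch code):

/*
 * vkBeginCommandBuffer  -> buffer created, ref_cnt == 1 (command buffer)
 * vkQueueSubmit         -> tu_autotune_results_buffer_ref(),
 *                          ref_cnt == 2 (command buffer + submission data)
 * vkResetCommandBuffer  -> unref, ref_cnt == 1; the pending submission
 *                          keeps the BOs alive while the GPU writes them
 * process_results()     -> free_submission_data() unrefs, ref_cnt == 0,
 *                          the BOs are finally freed
 */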
@@ -125,8 +234,6 @@ static void
result_destructor(void *r)
{
   struct tu_renderpass_result *result = r;

   /* Just in case we manage to somehow still be on the pending_results list: */
   list_del(&result->node);
}
@@ -157,8 +264,6 @@ static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result = rzalloc_size(NULL, sizeof(*result));

   result->idx = p_atomic_inc_return(&at->idx_counter);
   result->rp_key = rp_key;

   ralloc_set_destructor(result, result_destructor);
@@ -199,10 +304,8 @@ history_add_result(struct tu_renderpass_history *history,
static void
process_results(struct tu_autotune *at)
{
   uint32_t current_fence = at->results->fence;

   uint32_t min_idx = ~0;
   uint32_t max_idx = 0;
   struct tu6_global *global = at->device->global_bo->map;
   uint32_t current_fence = global->autotune_fence;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
@@ -210,56 +313,42 @@ process_results(struct tu_autotune *at)
         break;

      struct tu_renderpass_history *history = result->history;

      min_idx = MIN2(min_idx, result->idx);
      max_idx = MAX2(max_idx, result->idx);
      uint32_t idx = result->idx % ARRAY_SIZE(at->results->result);

      result->samples_passed = at->results->result[idx].samples_end -
                               at->results->result[idx].samples_start;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(history, result);
   }

   list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
                            &at->pending_submission_cs, node) {
      if (submission_cs->fence > current_fence)
   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (submission_data->fence > current_fence)
         break;

      list_del(&submission_cs->node);
      tu_cs_finish(&submission_cs->cs);
      free(submission_cs);
   }

   if (max_idx - min_idx > TU_AUTOTUNE_MAX_RESULTS) {
      /* If results start to trample each other it's better to bail out */
      at->enabled = false;
      mesa_logw("disabling sysmem vs gmem autotuner because results "
                "are trampling each other: min_idx=%u, max_idx=%u",
                min_idx, max_idx);
      free_submission_data(submission_data);
   }
}

static struct tu_cs *
create_fence_cs(struct tu_device *dev, struct tu_autotune *at)
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   struct tu_submission_fence_cs *submission_cs =
      calloc(1, sizeof(struct tu_submission_fence_cs));
   submission_cs->fence = at->fence_counter;
   bool one_time_submit = cmdbuf->usage_flags &
                          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   tu_cs_init(&submission_cs->cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(&submission_cs->cs);

   tu_cs_emit_pkt7(&submission_cs->cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(&submission_cs->cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(&submission_cs->cs, autotune_results_ptr(at, fence));
   tu_cs_emit(&submission_cs->cs, at->fence_counter);

   tu_cs_end(&submission_cs->cs);

   list_addtail(&submission_cs->node, &at->pending_submission_cs);

   return &submission_cs->cs;
   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy = ralloc_size(NULL, sizeof(*result));
         *copy = *result;
         ralloc_set_destructor(copy, result_destructor);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

struct tu_cs *
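Why stealing the list is safe only for ONE_TIME_SUBMIT: splicing moves the list nodes that are embedded in each tu_renderpass_result, so the command buffer gives its entries up for good. A runnable sketch of the same operation, using a minimal intrusive list as a stand-in for mesa's util/list.h:

#include <stdio.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *h) { h->prev = h->next = h; }

static void list_addtail(struct node *n, struct node *h)
{
   n->next = h;
   n->prev = h->prev;
   h->prev->next = n;
   h->prev = n;
}

/* Move all of src's nodes to the tail of dst. Afterwards the entries
 * belong to dst, which is why the driver only does this for command
 * buffers that will never be submitted again. */
static void list_splicetail(struct node *src, struct node *dst)
{
   if (src->next == src)
      return;
   src->next->prev = dst->prev;
   src->prev->next = dst;
   dst->prev->next = src->next;
   dst->prev = src->prev;
}

int main(void)
{
   struct node pending, results, a, b;
   list_init(&pending);
   list_init(&results);
   list_addtail(&a, &results);
   list_addtail(&b, &results);

   list_splicetail(&results, &pending);
   list_init(&results);   /* source must be re-initialized, as in the patch */

   printf("%d\n", pending.next == &a && a.next == &b && b.next == &pending);
   return 0;
}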
@@ -274,6 +363,7 @@ tu_autotune_on_submit(struct tu_device *dev,

   /* pre-increment so zero isn't a valid fence */
   uint32_t new_fence = ++at->fence_counter;
   uint32_t result_buffers = 0;

   /* Create history entries here to minimize the work and locking being
    * done on renderpass end.
@@ -305,12 +395,28 @@ tu_autotune_on_submit(struct tu_device *dev,
      }

      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
         list_splicetail(&cmdbuf->renderpass_autotune_results,
                         &at->pending_results);
         list_inithead(&cmdbuf->renderpass_autotune_results);
         result_buffers++;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at);
   submission_data->buffers_count = result_buffers;
   submission_data->buffers =
      malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers);

   uint32_t buffer_idx = 0;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);

      submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer;
      tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer);
   }

#if TU_AUTOTUNE_DEBUG_LOG != 0
   mesa_logi("Total history entries: %u", at->ht->entries);
#endif
@@ -336,7 +442,7 @@ tu_autotune_on_submit(struct tu_device *dev,
      ralloc_free(history);
   }

   return create_fence_cs(dev, at);
   return &submission_data->fence_cs;
}

static bool
@@ -354,44 +460,17 @@ renderpass_key_hash(const void *_a)
VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   VkResult result;

   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   result = tu_bo_init_new(dev, &at->results_bo,
                           sizeof(struct tu_autotune_results),
                           TU_BO_ALLOC_NO_FLAGS);
   if (result != VK_SUCCESS) {
      vk_startup_errorf(dev->instance, result, "Autotune BO init");
      goto fail_bo;
   }

   result = tu_bo_map(dev, at->results_bo);

   if (result != VK_SUCCESS) {
      vk_startup_errorf(dev->instance, result, "Autotune BO map");
      goto fail_map_bo;
   }

   at->results = at->results_bo->map;

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_cs);
   list_inithead(&at->pending_submission_data);

   return VK_SUCCESS;

fail_map_bo:
   tu_bo_finish(dev, at->results_bo);

fail_bo:
   u_rwlock_destroy(&at->ht_lock);
   _mesa_hash_table_destroy(at->ht, NULL);

   return result;
}

void
@@ -417,15 +496,13 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
      ralloc_free(history);
   }

   list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
                            &at->pending_submission_cs, node) {
      tu_cs_finish(&submission_cs->cs);
      free(submission_cs);
   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
   tu_bo_finish(dev, at->results_bo);
}

bool
@@ -487,16 +564,16 @@ tu_autotune_use_bypass(struct tu_autotune *at,
      return false;
   }

   /* If we would want to support buffers that could be submitted
    * several times we would have to copy the sample counts of renderpasses
    * after each submission of such buffer (like with u_trace support).
    * This is rather messy and since almost all apps use ONE_TIME_SUBMIT
    * we choose to unconditionally use fallback.
   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * the results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool one_time_submit = cmd_buffer->usage_flags &
                          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || !one_time_submit)
   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64bit hash as the key since we don't fear a rare hash collision,
@@ -555,3 +632,83 @@ tu_autotune_use_bypass(struct tu_autotune *at,

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

static uint32_t
get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer)
{
   uint32_t results_per_bo =
      TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples);
   return (buffer->results_written % results_per_bo) *
          sizeof(struct tu_renderpass_samples);
}

static struct tu_bo *
get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer)
{
   if (get_offset_for_renderpass(buffer) == 0) {
      buffer->num_bos++;
      buffer->bos =
         reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos);
      struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1];

      tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE,
                     TU_BO_ALLOC_NO_FLAGS);
      tu_bo_map(buffer->device, *new_bo);
   }

   return buffer->bos[buffer->num_bos - 1];
}

void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   /* Lazily allocate memory for renderpass results.
    * Secondary command buffers do not support renderpasses.
    */
   assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
   if (!cmd->autotune_buffer) {
      cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device);
   }

   uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
   struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer);

   uint64_t result_iova = bo->iova + bo_offset;

   autotune_result->samples =
      (struct tu_renderpass_samples *) (bo->map + bo_offset);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
   struct tu_bo *bo =
      cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1];
   cmd->autotune_buffer->results_written += 1;

   uint64_t result_iova = bo->iova + bo_offset +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}
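A worked example of the offset math above (illustrative, not part of the patch). Note that tu_autotune_end_renderpass() reads the offset before bumping results_written, so the begin and end writes target the same record.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   const uint32_t bo_size = 4096;   /* TU_AUTOTUNE_RP_BO_SIZE */
   const uint32_t rec_size = 32;    /* sizeof(struct tu_renderpass_samples) */
   const uint32_t per_bo = bo_size / rec_size;

   assert(per_bo == 128);
   assert((0   % per_bo) * rec_size == 0);    /* 1st record: offset 0, BO #1 */
   assert((127 % per_bo) * rec_size == 4064); /* last record that fits BO #1 */
   assert((128 % per_bo) * rec_size == 0);    /* wraps: BO #2 gets allocated */
   return 0;
}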
tu_autotune.h
@@ -28,15 +28,9 @@
#include "util/list.h"
#include "util/rwlock.h"

#define autotune_offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define autotune_results_ptr(at, member) \
   (at->results_bo->iova + \
    autotune_offset((at)->results, &(at)->results->member))

struct tu_device;
struct tu_cmd_buffer;

struct tu_autotune_results;
struct tu_renderpass_history;

/**
@@ -75,6 +69,8 @@ struct tu_autotune {
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
@@ -82,12 +78,6 @@ struct tu_autotune {
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * GPU buffer used to communicate back results to the CPU
    */
   struct tu_bo *results_bo;
   struct tu_autotune_results *results;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
@@ -95,49 +85,30 @@ struct tu_autotune {
   struct list_head pending_results;

   /**
    * List of per-submission CS that we are waiting for the GPU
    * to finish using.
    * List of per-submission data that we may want to free after we
    * have processed the submission results.
    * This could happen after the command buffers which were in the
    * submission are destroyed.
    */
   struct list_head pending_submission_cs;
   struct list_head pending_submission_data;

   uint32_t fence_counter;
   uint32_t idx_counter;
};

#define TU_AUTOTUNE_MAX_RESULTS 256

/**
 * The layout of the memory used to read back per-batch results from the
 * GPU
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note this struct is intentionally aligned to 4k. And hw requires the
 * sample start/stop locations to be 128b aligned.
 * Note that we do the math on the CPU to avoid a WFI. But pre-emption
 * may force us to revisit that.
 */
struct tu_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes the submission, so that the CPU knows when the
    * results are valid.
    */
   uint32_t fence;

   uint32_t __pad0;
struct tu_renderpass_samples {
   uint64_t samples_start;
   /* hw requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI. But pre-emption
    * may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t __pad0;
      uint64_t samples_end;
      uint64_t __pad1;
   } result[TU_AUTOTUNE_MAX_RESULTS];
};

/**
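The padding keeps both counters on 128-bit boundaries, as the hardware requires. Illustrative C11 layout checks, not part of the patch:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static_assert(sizeof(struct tu_renderpass_samples) == 4 * sizeof(uint64_t),
              "one 32-byte record per renderpass");
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16,
              "samples_end lands on a 128-bit boundary");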
@@ -146,12 +117,8 @@ struct tu_autotune_results {
 * time, when the GPU has finished writing the results, we fill samples_passed.
 */
struct tu_renderpass_result {

   /**
    * The index/slot in tu_autotune_results::result[] to write start/end
    * counter to
    */
   unsigned idx;
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   /*
    * Below here, only used internally within autotune
@@ -183,5 +150,17 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

struct tu_autotune_results_buffer;

void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer);
void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer);

void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */
tu_cmd_buffer.c
@@ -1220,51 +1220,9 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
   tu_cond_exec_end(cs);
}

static void
tu6_autotune_begin(struct tu_cs *cs, struct tu_autotune *at,
                   const struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
   uint64_t begin_iova = autotune_results_ptr(at, result[result_idx].samples_start);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
tu6_autotune_end(struct tu_cs *cs, struct tu_autotune *at,
                 const struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
   uint64_t end_iova = autotune_results_ptr(at, result[result_idx].samples_end);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   /* A fence would be emitted at submission time */
}

static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                        const struct tu_renderpass_result *autotune_result)
                        struct tu_renderpass_result *autotune_result)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@@ -1294,16 +1252,16 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);

   tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
   tu_autotune_begin_renderpass(cmd, cs, autotune_result);

   tu_cs_sanity_check(cs);
}

static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      const struct tu_renderpass_result *autotune_result)
                      struct tu_renderpass_result *autotune_result)
{
   tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
   tu_autotune_end_renderpass(cmd, cs, autotune_result);

   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
@@ -1322,7 +1280,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      const struct tu_renderpass_result *autotune_result)
                      struct tu_renderpass_result *autotune_result)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

@@ -1372,7 +1330,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                     A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
   }

   tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
   tu_autotune_begin_renderpass(cmd, cs, autotune_result);

   tu_cs_sanity_check(cs);
}
@@ -1403,9 +1361,9 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)

static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                    const struct tu_renderpass_result *autotune_result)
                    struct tu_renderpass_result *autotune_result)
{
   tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
   tu_autotune_end_renderpass(cmd, cs, autotune_result);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

@@ -1421,7 +1379,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                    const struct tu_renderpass_result *autotune_result)
                    struct tu_renderpass_result *autotune_result)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@@ -1458,7 +1416,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
                     const struct tu_renderpass_result *autotune_result)
                     struct tu_renderpass_result *autotune_result)
{
   tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result);

@@ -1536,6 +1494,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)

   u_trace_fini(&cmd_buffer->trace);

   if (cmd_buffer->autotune_buffer)
      tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
@@ -1562,6 +1522,15 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
   tu_cs_reset(&cmd_buffer->sub_cs);

   /* We can't just reset the autotune_buffer's contents, because it is
    * also referenced by the submission_data if the command buffer was
    * submitted, and we may be accessing it after the cmdbuf is reset or
    * freed.
    */
   if (cmd_buffer->autotune_buffer) {
      tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
      cmd_buffer->autotune_buffer = NULL;
   }

   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
tu_private.h
@@ -411,6 +411,9 @@ struct tu6_global

   ALIGN16 uint32_t cs_indirect_xyz[3];

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
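For reference, the fence CS built in create_submission_data() computes its write address as dev->global_bo->iova + gb_offset(autotune_fence); gb_offset is presumably the driver's offsetof-style helper into struct tu6_global, i.e. something along the lines of:

/* Hypothetical sketch of the helper; the real macro lives elsewhere. */
#define gb_offset(member) offsetof(struct tu6_global, member)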
@@ -1170,6 +1173,7 @@ struct tu_cmd_buffer
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer *autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;