tu: Rewrite autotune in C++

Completely overhauls the autotuner in C++ with the functionality being extended as well. Signed-off-by: Dhruv Mark Collins <mark@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37802>
2026-05-16 14:08:07 +02:00 · 2025-10-09 19:34:43 +00:00 · 2025-10-09 19:34:43 +00:00 · 40ffc052af
commit 40ffc052af
parent bd88997c0a
9 changed files with 1234 additions and 842 deletions
--- a/docs/drivers/freedreno.rst
+++ b/docs/drivers/freedreno.rst
@ -670,3 +670,38 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
 Some of these options will behave differently when toggled at runtime, for example:
 ``nolrz`` will still result in LRZ allocation which would not happen if the option
 was set in the environment variable.
 Autotune
 ^^^^^^^^
 Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
 autotune system, the behavior of which can be controlled with the following
 environment variables:
 .. envvar:: TU_AUTOTUNE_ALGO
  Selects the algorithm used for autotuning. Supported values are:
  ``bandwidth``
    Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
    the one with lower estimated bandwidth. This is the default algorithm.
 .. envvar:: TU_AUTOTUNE_FLAGS
  Modifies the behavior of the selected algorithm. Supported flags are:
  ``big_gmem``
    Always chooses GMEM rendering if the number of draw calls in the render pass
    is greater than a certain threshold. Larger RPs generally benefit more from
    GMEM rendering due to less overhead from tiling. However, forcing GMEM this
    way tends to lead to worse performance in most cases, so this flag is only
    useful for testing.
  ``small_sysmem``
    Always chooses SYSMEM rendering if the number of draw calls in the render pass
    is lower than a certain threshold. The benefits of GMEM rendering are less
    pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
  Multiple flags can be combined by separating them with commas, e.g.
  ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
  If no flags are specified, the default behavior is used.
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@ -8,150 +8,237 @@
 #include "tu_common.h"
-#include "util/hash_table.h"
+#include <atomic>
-#include "util/rwlock.h"
+#include <deque>
 #include <memory>
 #include <mutex>
 #include <shared_mutex>
 #include <unordered_map>
 #include <vector>
 #include "tu_cs.h"
 #include "tu_suballoc.h"
-struct tu_renderpass_history;
+/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
-
+ * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
-/**
+ * analysis, since we can adapt to the actual workload rather than relying on heuristics.
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimateable without having some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      if there is sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But high draw count where there is not much overdraw can actually be
 *      faster in bypass mode if it is pushing a lot of state change, due to
 *      not having to go thru the state changes per-tile[1]
 *
 * The approach taken is to measure the samples-passed for the batch to estimate
 * the amount of overdraw to detect cases where the number of pixels touched is
 * low.
 *
 * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
 struct tu_autotune {
-
+ private:
-   /* We may have to disable autotuner if there are too many
+   bool enabled = true;
    * renderpasses in-flight.
    */
   bool enabled;
   struct tu_device *device;
-   /**
+   /** Configuration **/
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;
-   /**
+   enum class algorithm : uint8_t;
-    * List of per-renderpass results that we are waiting for the GPU
+   enum class mod_flag : uint8_t;
-    * to finish with before reading back the results.
+   enum class metric_flag : uint8_t;
-    */
+   /* Container for all autotune configuration options. */
-   struct list_head pending_results;
+   struct PACKED config_t;
   union PACKED packed_config_t;
-   /**
+   /* Allows for thread-safe access to the configurations. */
-    * List of per-submission data that we may want to free after we
+   struct atomic_config_t {
-    * processed submission results.
+    private:
-    * This could happend after command buffers which were in the submission
+      std::atomic<uint32_t> config_bits = 0;
    * are destroyed.
    */
   struct list_head pending_submission_data;
-   /**
+    public:
-    * List of per-submission data that has been finished and can be reused.
+      atomic_config_t(config_t initial_config);
    */
   struct list_head submission_data_pool;
-   uint32_t fence_counter;
+      config_t load() const;
-   uint32_t idx_counter;
+
      bool compare_and_store(config_t expected, config_t updated);
   } active_config;
   config_t get_env_config();
   /** Global Fence and Internal CS Management **/
   /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers.
    * Synchronized by suballoc_mutex.
    */
   struct tu_suballocator suballoc;
   std::mutex suballoc_mutex;
   /* The next value to assign to tu6_global::autotune_fence, this is incremented during on_submit. */
   uint32_t next_fence = 1;
   /* A wrapper around a CS which sets the global autotune fence to a certain fence value, this allows for ergonomically
    * managing the lifetime of the CS including recycling it after the fence value has been reached.
    */
   struct submission_entry {
    private:
      uint32_t fence;
      struct tu_cs fence_cs;
    public:
      explicit submission_entry(tu_device *device);
      ~submission_entry();
      /* Disable move/copy, since this holds stable pointers to the fence_cs. */
      submission_entry(const submission_entry &) = delete;
      submission_entry &operator=(const submission_entry &) = delete;
      submission_entry(submission_entry &&) = delete;
      submission_entry &operator=(submission_entry &&) = delete;
      /* The current state of the submission entry, this is used to track whether the CS is available for reuse, pending
       * GPU completion or currently being processed.
       */
      bool is_active() const;
      /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
      struct tu_cs *try_get_cs(uint32_t new_fence);
   };
   /* Unified pool for submission CSes.
    * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
    */
   std::deque<submission_entry> submission_entries;
   /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
   struct tu_cs *get_cs_for_fence(uint32_t fence);
   /** RP Entry Management **/
   struct rp_gpu_data;
   struct tile_gpu_data;
   struct rp_entry;
   /* A wrapper over all entries associated with a single command buffer. */
   struct rp_entry_batch {
      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
                         valid fence. */
      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
                         determine when the entries can be processed. */
      std::vector<std::unique_ptr<rp_entry>> entries;
      rp_entry_batch();
      /* Disable the copy/move to avoid performance hazards. */
      rp_entry_batch(const rp_entry_batch &) = delete;
      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
      rp_entry_batch(rp_entry_batch &&) = delete;
      rp_entry_batch &operator=(rp_entry_batch &&) = delete;
      void assign_fence(uint32_t new_fence);
      void mark_inactive();
   };
   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
    * iteration and to ensure that we process the entries in the same order they were submitted.
    */
   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
   /* Handles processing of entry batches that are pending to be processed.
    *
    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
    *       in the on_submit() method, which is called on every submit of a command buffer.
    */
   void process_entries();
   /** Renderpass State Tracking **/
   struct rp_history;
   struct rp_history_handle;
   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
    *
    * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
    *       rp_key, which would lead to true value-based equality rather than just hash-based equality which has a cost
    *       but avoids hash collisions causing issues.
    */
   struct rp_key {
      uint64_t hash;
      rp_key(const struct tu_render_pass *pass,
             const struct tu_framebuffer *framebuffer,
             const struct tu_cmd_buffer *cmd);
      /* Equality operator, used in unordered_map. */
      constexpr bool operator==(const rp_key &other) const noexcept
      {
         return hash == other.hash;
      }
   };
   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
    *
    * Note: This should *NEVER* be used to calculate the hash itself as it would lead to the hash being calculated
    *       multiple times, rather than being calculated once and reused when there's multiple successive lookups like
    *       with find_or_create_rp_history() and providing the hash to the rp_history constructor.
    */
   struct rp_hash {
      constexpr size_t operator()(const rp_key &key) const noexcept
      {
         /* Note: This will throw away the upper 32-bits on 32-bit architectures. */
         return static_cast<size_t>(key.hash);
      }
   };
   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
   rp_histories_t rp_histories;
   std::shared_mutex rp_mutex;
   uint64_t last_reap_ts = 0;
   /* Note: These will lock rp_mutex internally, no need to lock it. */
   rp_history_handle find_rp_history(const rp_key &key);
   rp_history_handle find_or_create_rp_history(const rp_key &key);
   void reap_old_rp_histories();
 public:
   tu_autotune(struct tu_device *device, VkResult &result);
   ~tu_autotune();
   /* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */
   using rp_ctx_t = rp_entry *;
   /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
    *
    * Note: tu_cmd_buffer is only responsible for the lifetime of this object, all the access to the context state is
    *       done through tu_autotune.
    */
   struct cmd_buf_ctx {
    private:
      /* A batch of all entries from RPs within this CB. */
      std::shared_ptr<rp_entry_batch> batch;
      /* Creates a new RP entry attached to this CB. */
      rp_entry *
      attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
      friend struct tu_autotune;
    public:
      cmd_buf_ctx();
      ~cmd_buf_ctx();
      /* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */
      void reset();
   };
   enum class render_mode {
      SYSMEM,
      GMEM,
   };
   render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
   void begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem);
   void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
   /* The submit-time hook for the autotuner. This may return a CS (can be NULL) which, when non-NULL, must be added
    * to the submission for autotuner tracking to function correctly.
    *
    * Note: This must be called from a single-threaded context. There should never be multiple threads calling this
    *       function at the same time.
    */
   struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
 };
-/**
+#endif /* TU_AUTOTUNE_H */
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
 * may force us to revisit that.
 */
 struct PACKED tu_renderpass_samples {
   uint64_t samples_start;
   /* hw requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
 };
 /* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
 static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
 /**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a later
 * time, when the GPU has finished writing the results, we fill samples_passed.
 */
 struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples* samples;
   struct tu_suballoc_bo bo;
   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
 };
 VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
 void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
 bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
 void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
 bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);
 /**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
 struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);
 struct tu_autotune_results_buffer;
 template <chip CHIP>
 void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);
 template <chip CHIP>
 void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);
 #endif /* TU_AUTOTUNE_H */
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@ -14,6 +14,7 @@
 #include "vk_render_pass.h"
 #include "vk_util.h"
 #include "tu_autotune.h"
 #include "tu_buffer.h"
 #include "tu_clear_blit.h"
 #include "tu_cs.h"
@ -1314,7 +1315,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
 static bool
 use_sysmem_rendering(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result **autotune_result)
+                     tu_autotune::rp_ctx_t *rp_ctx)
 {
   if (TU_DEBUG(SYSMEM)) {
      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
@ -1375,15 +1376,9 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
   if (TU_DEBUG(GMEM))
      return false;
-   bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
+   bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
-                                            cmd, autotune_result);
+   if (use_sysmem)
   if (*autotune_result) {
      list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
   }
   if (use_sysmem) {
      cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
   }
   return use_sysmem;
 }
@ -3128,7 +3123,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
 template <chip CHIP>
 static void
 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                        struct tu_renderpass_result *autotune_result)
+                        tu_autotune::rp_ctx_t rp_ctx)
 {
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
@ -3181,7 +3176,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
   }
-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true);
   tu_cs_sanity_check(cs);
 }
@ -3189,7 +3184,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 template <chip CHIP>
 static void
 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result)
+                      tu_autotune::rp_ctx_t rp_ctx)
 {
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
@ -3229,7 +3224,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit(cs, 0); /* value */
   }
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
   tu_cs_sanity_check(cs);
 }
@ -3379,7 +3374,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 template <chip CHIP>
 static void
 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result,
+                      tu_autotune::rp_ctx_t rp_ctx,
                      const VkOffset2D *fdm_offsets)
 {
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
@ -3565,7 +3560,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (use_cb)
      tu_trace_start_render_pass(cmd);
-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false);
   tu_cs_sanity_check(cs);
 }
@ -3628,7 +3623,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 template <chip CHIP>
 static void
 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                    struct tu_renderpass_result *autotune_result)
+                    tu_autotune::rp_ctx_t rp_ctx)
 {
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
@ -3658,7 +3653,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
   tu_cs_sanity_check(cs);
 }
@ -3767,7 +3762,7 @@ tu_emit_subsampled(struct tu_cmd_buffer *cmd,
 template <chip CHIP>
 static void
 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
-                    struct tu_renderpass_result *autotune_result,
+                    tu_autotune::rp_ctx_t rp_ctx,
                    const VkOffset2D *fdm_offsets)
 {
   const struct tu_tiling_config *tiling = cmd->state.tiling;
@ -3808,7 +3803,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
   tu_cs_end(&cmd->tile_store_cs);
-   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
+   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);
   /* Note: we reverse the order of walking the pipes and tiles on every
    * other row, to improve texture cache locality compared to raster order.
@ -3861,7 +3856,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
      }
   }
-   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
@ -3894,7 +3889,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
 template <chip CHIP>
 static void
 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result *autotune_result)
+                     tu_autotune::rp_ctx_t rp_ctx)
 {
   VkResult result = tu_allocate_transient_attachments(cmd, true);
@ -3905,7 +3900,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
   tu_trace_start_render_pass(cmd);
-   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);
   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);
@ -3913,7 +3908,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
-   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
   /* Outside of renderpasses we assume all draw states are disabled. */
   tu_disable_draw_states(cmd, &cmd->cs);
@ -3933,11 +3928,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
   if (cmd_buffer->state.rp.has_tess)
      tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
-   struct tu_renderpass_result *autotune_result = NULL;
+   tu_autotune::rp_ctx_t rp_ctx = NULL;
-   if (use_sysmem_rendering(cmd_buffer, &autotune_result))
+   if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
-      tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
+      tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
   else
-      tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
+      tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);
 }
 static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
@ -4003,7 +3998,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
   u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
   cmd_buffer->trace_renderpass_start =
      u_trace_begin_iterator(&cmd_buffer->rp_trace);
-   list_inithead(&cmd_buffer->renderpass_autotune_results);
+   new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();
   if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
      cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);
@ -4052,7 +4047,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
   u_trace_fini(&cmd_buffer->trace);
   u_trace_fini(&cmd_buffer->rp_trace);
-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.~cmd_buf_ctx();
   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      if (cmd_buffer->descriptors[i].push_set.layout)
@ -4129,7 +4124,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
   tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.reset();
   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@ -653,8 +653,7 @@ struct tu_cmd_buffer
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace trace, rp_trace;
-   struct list_head renderpass_autotune_results;
+   tu_autotune::cmd_buf_ctx autotune_ctx;
   struct tu_autotune_results_buffer* autotune_buffer;
   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@ -2692,7 +2692,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
 {
   mtx_destroy(&device->bo_mutex);
   mtx_destroy(&device->pipeline_mutex);
   mtx_destroy(&device->autotune_mutex);
   mtx_destroy(&device->kgsl_profiling_mutex);
   mtx_destroy(&device->event_mutex);
   mtx_destroy(&device->trace_mutex);
@ -2808,7 +2807,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   mtx_init(&device->bo_mutex, mtx_plain);
   mtx_init(&device->pipeline_mutex, mtx_plain);
   mtx_init(&device->autotune_mutex, mtx_plain);
   mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
   mtx_init(&device->event_mutex, mtx_plain);
   mtx_init(&device->trace_mutex, mtx_plain);
@ -2933,9 +2931,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                                TU_BO_ALLOC_ALLOW_DUMP |
                                TU_BO_ALLOC_INTERNAL_RESOURCE),
      "pipeline_suballoc");
   tu_bo_suballocator_init(&device->autotune_suballoc, device,
                           128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
                           "autotune_suballoc");
   if (is_kgsl(physical_device->instance)) {
      tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
                              128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
@ -3083,10 +3078,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   }
   pthread_condattr_destroy(&condattr);
-   result = tu_autotune_init(&device->autotune, device);
+   device->autotune = new tu_autotune(device, result);
-   if (result != VK_SUCCESS) {
+   if (result != VK_SUCCESS)
      goto fail_timeline_cond;
   }
   device->use_z24uint_s8uint =
      physical_device->info->props.has_z24uint_s8uint &&
@ -3244,10 +3238,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      free(device->dbg_renderpass_stomp_cs);
   }
-   tu_autotune_fini(&device->autotune, device);
+   delete device->autotune;
   tu_bo_suballocator_finish(&device->pipeline_suballoc);
   tu_bo_suballocator_finish(&device->autotune_suballoc);
   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
   tu_bo_suballocator_finish(&device->event_suballoc);
   tu_bo_suballocator_finish(&device->vis_stream_suballocator);
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@ -29,6 +29,7 @@
 #include "common/fd6_gmem_cache.h"
 #include "util/vma.h"
 #include "util/u_vector.h"
 #include "util/rwlock.h"
 /* queue types */
 #define TU_QUEUE_GENERAL 0
@ -267,7 +268,12 @@ struct tu6_global
   volatile uint32_t vtx_stats_query_not_running;
-   /* To know when renderpass stats for autotune are valid */
+   /* A fence with a monotonically increasing value that is
    * incremented by the GPU on each submission that includes
    * a tu_autotune::submission_entry CS. This is used to track
    * which submissions have been processed by the GPU before
    * processing the autotune packet on the CPU.
    */
   volatile uint32_t autotune_fence;
   /* For recycling command buffers for dynamic suspend/resume commands */
@ -357,12 +363,6 @@ struct tu_device
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;
   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers.  Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;
   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */
@ -460,7 +460,7 @@ struct tu_device
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;
-   struct tu_autotune autotune;
+   struct tu_autotune *autotune;
   struct breadcrumbs_context *breadcrumbs_ctx;
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@ -549,27 +549,6 @@ tu_render_pass_disable_fdm(struct tu_device *dev, struct tu_render_pass *pass)
   return false;
 }
 static void
 tu_render_pass_calc_hash(struct tu_render_pass *pass)
 {
   #define HASH(hash, data) XXH64(&(data), sizeof(data), hash)
   uint64_t hash = HASH(0, pass->attachment_count);
   hash = XXH64(pass->attachments,
         pass->attachment_count * sizeof(pass->attachments[0]), hash);
   hash = HASH(hash, pass->subpass_count);
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      hash = HASH(hash, pass->subpasses[i].samples);
      hash = HASH(hash, pass->subpasses[i].input_count);
      hash = HASH(hash, pass->subpasses[i].color_count);
      hash = HASH(hash, pass->subpasses[i].resolve_count);
   }
   pass->autotune_hash = hash;
   #undef HASH
 }
 static void
 tu_render_pass_cond_config(struct tu_device *device,
                           struct tu_render_pass *pass)
@ -1354,7 +1333,6 @@ tu_CreateRenderPass2(VkDevice _device,
   tu_render_pass_gmem_config(pass, device->physical_device);
   tu_render_pass_bandwidth_config(pass);
   tu_render_pass_calc_views(pass);
   tu_render_pass_calc_hash(pass);
   for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
      tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
@ -1834,7 +1812,6 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
   tu_render_pass_gmem_config(pass, device->physical_device);
   tu_render_pass_bandwidth_config(pass);
   tu_render_pass_calc_views(pass);
   tu_render_pass_calc_hash(pass);
 }
 void
--- a/src/freedreno/vulkan/tu_queue.cc
+++ b/src/freedreno/vulkan/tu_queue.cc
@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
   struct tu_device *device = queue->device;
   bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
   struct util_dynarray dump_cmds;
   struct tu_cs *autotune_cs = NULL;
   if (vk_submit->buffer_bind_count ||
       vk_submit->image_bind_count ||
@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
      }
   }
-   if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
+   autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
-      struct tu_cs *autotune_cs = tu_autotune_on_submit(
+   if (autotune_cs) {
         device, &device->autotune, cmd_buffers, cmdbuf_count);
      submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
                         autotune_cs->entry_count);
   }