Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2025-12-20 07:20:10 +01:00)

Merge branch 'tu-newat' into 'main'

turnip: Autotuner Overhaul

See merge request mesa/mesa!37802

Commit adbb7f760f, 17 changed files with 2309 additions and 848 deletions
@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation which would not happen if the option
was set in the environment variable.

Autotune
^^^^^^^^

Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
autotune system, the behavior of which can be controlled with the following
environment variables:

.. envvar:: TU_AUTOTUNE_ALGO

   Selects the algorithm used for autotuning. Supported values are:

   ``bandwidth``
      Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
      the one with the lower estimated bandwidth.

   ``profiled``
      Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to
      move a probability distribution towards the optimal choice over time. This
      algorithm tends to be far more accurate than the bandwidth algorithm at choosing
      the optimal rendering mode, but may result in larger FPS variance because it is
      based on a probability distribution with random sampling. This is the default
      algorithm.

   ``profiled_imm``
      Similar to ``profiled``, but only profiles the first few instances of an RP
      and then sticks to the chosen mode for subsequent instances. This is meant
      for single-frame traces run multiple times in CI, where this algorithm can
      immediately choose the optimal rendering mode for each RP.

   ``prefer_sysmem``
      Always chooses SYSMEM rendering. This is useful for games that don't benefit
      from GMEM rendering due to their rendering patterns; when done for performance
      reasons, setting this is better than using ``TU_DEBUG=sysmem``.

   The algorithm can be set via the driconf option ``tu_autotune_algorithm`` as well.
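To make the ``profiled`` algorithm above more concrete, here is a minimal, self-contained C++ sketch of the general idea. The names and smoothing constants are illustrative assumptions, not turnip's actual implementation:

   #include <random>

   /* Simplified model of a "profiled"-style chooser: keep smoothed timings for
    * both modes and pick GMEM with a probability derived from how much faster
    * it has been so far. */
   struct profiled_chooser {
      double sysmem_ns = 1.0, gmem_ns = 1.0; /* exponentially smoothed timings */
      std::mt19937 rng{12345};

      bool choose_gmem() {
         /* The probability of picking GMEM grows as its smoothed time shrinks
          * relative to SYSMEM's. */
         double p = sysmem_ns / (sysmem_ns + gmem_ns);
         return std::bernoulli_distribution(p)(rng);
      }

      void report(bool was_gmem, double measured_ns) {
         double &t = was_gmem ? gmem_ns : sysmem_ns;
         t = 0.9 * t + 0.1 * measured_ns; /* nudge the estimate toward the new sample */
      }
   };

Because each decision is sampled rather than fixed, frame times can vary more than with a purely static estimate, which matches the FPS-variance caveat above.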
.. envvar:: TU_AUTOTUNE_FLAGS

   Modifies the behavior of the selected algorithm. Supported flags are:

   ``big_gmem``
      Always chooses GMEM rendering if the number of draw calls in the render pass
      is greater than a certain threshold. Larger RPs generally benefit more from
      GMEM rendering due to less overhead from tiling.

   ``small_sysmem``
      Always chooses SYSMEM rendering if the number of draw calls in the render pass
      is lower than a certain threshold. The benefits of GMEM rendering are less
      pronounced in these smaller RPs, and SYSMEM rendering tends to win more often.

   ``preempt_optimize``
      Tries to keep the non-preemptible time in the render pass below a certain
      threshold. This is useful for systems with GPU-based compositors, where long
      non-preemptible times can lead to missed frame deadlines, causing noticeable
      stuttering. This flag reduces the performance of the render pass in order
      to improve overall system responsiveness; it should not be used unless the
      rest of the system is affected by preemption delays.

   Multiple flags can be combined by separating them with commas, e.g.
   ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.

   If no flags are specified, the default behavior is used.
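As an illustration of how a comma-separated flag list like the one above can be consumed, here is a standalone C++ sketch. The flag constants and the helper are hypothetical; the driver's real option handling (environment variable plus driconf) may differ:

   #include <cstdint>
   #include <string>

   enum autotune_flag : uint32_t {
      FLAG_BIG_GMEM         = 1u << 0,
      FLAG_SMALL_SYSMEM     = 1u << 1,
      FLAG_PREEMPT_OPTIMIZE = 1u << 2,
   };

   /* Turn "big_gmem,small_sysmem" into FLAG_BIG_GMEM | FLAG_SMALL_SYSMEM. */
   uint32_t parse_autotune_flags(const char *env)
   {
      uint32_t flags = 0;
      if (!env)
         return flags;
      std::string s(env);
      size_t start = 0;
      while (start <= s.size()) {
         size_t end = s.find(',', start);
         if (end == std::string::npos)
            end = s.size();
         std::string tok = s.substr(start, end - start);
         if (tok == "big_gmem")
            flags |= FLAG_BIG_GMEM;
         else if (tok == "small_sysmem")
            flags |= FLAG_SMALL_SYSMEM;
         else if (tok == "preempt_optimize")
            flags |= FLAG_PREEMPT_OPTIMIZE;
         start = end + 1;
      }
      return flags;
   }

Unknown tokens are silently ignored in this sketch; a real implementation would likely warn about them.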
@ -4,7 +4,7 @@ DisableFormat: false
AlwaysBreakAfterReturnType: TopLevel
BinPackParameters: false
-ColumnLimit: 78
+ColumnLimit: 120
Cpp11BracedListStyle: false

IncludeBlocks: Regroup
(File diff suppressed because it is too large)
@ -8,150 +8,265 @@

#include "tu_common.h"

-#include "util/hash_table.h"
-#include "util/rwlock.h"
+#include <atomic>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "tu_cs.h"
#include "tu_suballoc.h"

-struct tu_renderpass_history;
+/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */
+#define TU_AUTOTUNE_DEBUG_PERFCTR 0

-/**
- * "autotune" our decisions about bypass vs GMEM rendering, based on historical
- * data about a given render target.
- *
- * In deciding which path to take there are tradeoffs, including some that
- * are not reasonably estimateable without having some additional information:
- *
- * (1) If you know you are touching every pixel (ie. there is a clear),
- *     then the GMEM path will at least not cost more memory bandwidth than
- *     sysmem[1]
- *
- * (2) If there is no clear, GMEM could potentially cost *more* bandwidth
- *     if there is sysmem->GMEM restore pass.
- *
- * (3) If you see a high draw count, that is an indication that there will be
- *     enough pixels accessed multiple times to benefit from the reduced
- *     memory bandwidth that GMEM brings
- *
- * (4) But high draw count where there is not much overdraw can actually be
- *     faster in bypass mode if it is pushing a lot of state change, due to
- *     not having to go thru the state changes per-tile[1]
- *
- * The approach taken is to measure the samples-passed for the batch to estimate
- * the amount of overdraw to detect cases where the number of pixels touched is
- * low.
- *
- * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
- *     most of the tiles late in the tile-pass can defeat that
+/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
+ * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
+ * analysis, since we can adapt to the actual workload rather than relying on heuristics.
 */
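The comment block removed above laid out the old heuristics: a clear makes the GMEM path no worse on bandwidth, a missing clear forces a sysmem-to-GMEM restore, and samples-passed is used to estimate overdraw. As a rough, self-contained illustration of that style of bandwidth comparison (the cost terms and names are assumptions for illustration, not the driver's formula):

   #include <cstdint>

   /* Hypothetical byte-count comparison between the two rendering paths. */
   struct rt_info {
      uint64_t rt_bytes;        /* bytes touched by one full render-target write */
      uint64_t samples_passed;  /* from the GPU occlusion counters */
      uint64_t pixels;          /* render area in pixels */
      bool needs_gmem_restore;  /* no clear, so a sysmem-to-GMEM load is required */
   };

   bool prefer_gmem(const rt_info &rt)
   {
      /* Average overdraw: how many times each pixel was written. */
      double overdraw = (double)rt.samples_passed / (double)rt.pixels;

      /* SYSMEM pays full memory bandwidth for every covered sample. */
      double sysmem_bytes = overdraw * (double)rt.rt_bytes;

      /* GMEM pays one store of the render target, plus a load when it cannot
       * simply be cleared on-chip first. */
      double gmem_bytes = (double)rt.rt_bytes * (rt.needs_gmem_restore ? 2.0 : 1.0);

      return gmem_bytes < sysmem_bytes;
   }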
struct tu_autotune {
+ private:
-   /* We may have to disable autotuner if there are too many
-    * renderpasses in-flight.
-    */
-   bool enabled;
+   bool enabled = true;

   struct tu_device *device;

-   /**
-    * Cache to map renderpass key to historical information about
-    * rendering to that particular render target.
-    */
-   struct hash_table *ht;
-   struct u_rwlock ht_lock;
-
-   /**
-    * List of per-renderpass results that we are waiting for the GPU
-    * to finish with before reading back the results.
-    */
-   struct list_head pending_results;
-
-   /**
-    * List of per-submission data that we may want to free after we
-    * processed submission results.
-    * This could happend after command buffers which were in the submission
-    * are destroyed.
-    */
-   struct list_head pending_submission_data;
-
-   /**
-    * List of per-submission data that has been finished and can be reused.
-    */
-   struct list_head submission_data_pool;
-
-   uint32_t fence_counter;
-   uint32_t idx_counter;
+   /** Configuration **/
+
+   enum class algorithm : uint8_t;
+   enum class mod_flag : uint8_t;
+   enum class metric_flag : uint8_t;
+   /* Container for all autotune configuration options. */
+   struct PACKED config_t;
+   union PACKED packed_config_t;
+
+   /* Allows for thread-safe access to the configurations. */
+   struct atomic_config_t {
+    private:
+      std::atomic<uint32_t> config_bits = 0;
+
+    public:
+      atomic_config_t(config_t initial_config);
+
+      config_t load() const;
+      bool compare_and_store(config_t updated, config_t expected);
+   } active_config;
+
+   config_t get_env_config();
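atomic_config_t above suggests a configuration word that is read and republished atomically. A self-contained sketch of that pattern follows; the packed layout and the field names are assumptions, not the real config_t:

   #include <atomic>
   #include <cstdint>

   /* Hypothetical packed configuration: algorithm in the low byte, flags above it. */
   struct packed_config {
      uint32_t bits = 0;
   };

   struct atomic_config {
      std::atomic<uint32_t> bits{0};

      packed_config load() const {
         return { bits.load(std::memory_order_acquire) };
      }

      /* Publish `updated` only if the configuration is still `expected`; returns
       * false if another thread changed it first, so the caller can re-read and retry. */
      bool compare_and_store(packed_config updated, packed_config expected) {
         uint32_t exp = expected.bits;
         return bits.compare_exchange_strong(exp, updated.bits,
                                             std::memory_order_acq_rel);
      }
   };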
+
+   /** Global Fence and Internal CS Management **/
+
+   /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers.
+    * Synchronized by suballoc_mutex.
+    */
+   struct tu_suballocator suballoc;
+   std::mutex suballoc_mutex;
+
+   /* The next value to assign to tu6_global::autotune_fence; this is incremented during on_submit. */
+   uint32_t next_fence = 1;
+
+   /* A wrapper around a CS which sets the global autotune fence to a certain fence value. This allows for ergonomically
+    * managing the lifetime of the CS, including recycling it after the fence value has been reached.
+    */
+   struct submission_entry {
+    private:
+      uint32_t fence;
+      struct tu_cs fence_cs;
+
+    public:
+      explicit submission_entry(tu_device *device);
+
+      ~submission_entry();
+
+      /* Disable move/copy, since this holds stable pointers to the fence_cs. */
+      submission_entry(const submission_entry &) = delete;
+      submission_entry &operator=(const submission_entry &) = delete;
+      submission_entry(submission_entry &&) = delete;
+      submission_entry &operator=(submission_entry &&) = delete;
+
+      /* The current state of the submission entry; this is used to track whether the CS is available for reuse,
+       * pending GPU completion, or currently being processed.
+       */
+      bool is_active() const;
+
+      /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
+      struct tu_cs *try_get_cs(uint32_t new_fence);
+   };
+
+   /* Unified pool for submission CSes.
+    * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
+    */
+   std::deque<submission_entry> submission_entries;
+
+   /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
+   struct tu_cs *get_cs_for_fence(uint32_t fence);
+
+   /** RP Entry Management **/
+
+   struct rp_gpu_data;
+   struct tile_gpu_data;
+   struct rp_entry;
+
+   /* A wrapper over all entries associated with a single command buffer. */
+   struct rp_entry_batch {
+      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
+                         valid fence. */
+      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
+                         determine when the entries can be processed. */
+      std::vector<std::unique_ptr<rp_entry>> entries;
+
+      rp_entry_batch();
+
+      /* Disable the copy/move to avoid performance hazards. */
+      rp_entry_batch(const rp_entry_batch &) = delete;
+      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
+      rp_entry_batch(rp_entry_batch &&) = delete;
+      rp_entry_batch &operator=(rp_entry_batch &&) = delete;
+
+      void assign_fence(uint32_t new_fence);
+   };
+
+   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
+    * iteration and to ensure that we process the entries in the same order they were submitted.
+    */
+   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
+
+   /* Handles processing of entry batches that are pending processing.
+    *
+    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do
+    * this in the on_submit() method, which is called on every submit of a command buffer.
+    */
+   void process_entries();
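The active_batches deque and process_entries() above describe a drain-in-submission-order pattern. A simplified standalone sketch of the idea, with a plain integer standing in for the GPU-written fence and wrap-around handling omitted:

   #include <cstdint>
   #include <deque>
   #include <memory>

   struct batch {
      bool active = false; /* submitted to the GPU and assigned a fence */
      uint32_t fence = 0;  /* value the GPU writes once the batch completes */
   };

   /* Drop batches whose fence the GPU has already written, in submission order. */
   void process_entries(std::deque<std::shared_ptr<batch>> &batches,
                        uint32_t completed_fence)
   {
      while (!batches.empty()) {
         const auto &b = batches.front();
         if (!b->active || b->fence > completed_fence)
            break; /* later batches are newer, so stop at the first unfinished one */
         /* ...read back and accumulate the GPU results for this batch here... */
         batches.pop_front();
      }
   }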
+
+   /** Renderpass State Tracking **/
+
+   struct rp_history;
+   struct rp_history_handle;
+
+   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
+    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
+    *
+    * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
+    * rp_key, which would lead to true value-based equality rather than just hash-based equality; that has a cost
+    * but avoids hash collisions causing issues.
+    */
+   struct rp_key {
+      uint64_t hash;
+
+      rp_key(const struct tu_render_pass *pass,
+             const struct tu_framebuffer *framebuffer,
+             const struct tu_cmd_buffer *cmd);
+
+      /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
+      rp_key(const rp_key &key, uint32_t duplicates);
+
+      /* Equality operator, used in unordered_map. */
+      constexpr bool operator==(const rp_key &other) const noexcept
+      {
+         return hash == other.hash;
+      }
+   };
+
+   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
+    *
+    * Note: This should *NEVER* be used to calculate the hash itself, as that would lead to the hash being calculated
+    * multiple times rather than being calculated once and reused when there are multiple successive lookups, like
+    * with find_or_create_rp_history() and providing the hash to the rp_history constructor.
+    */
+   struct rp_hash {
+      constexpr size_t operator()(const rp_key &key) const noexcept
+      {
+         /* Note: This will throw away the upper 32 bits on 32-bit architectures. */
+         return static_cast<size_t>(key.hash);
+      }
+   };
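rp_key and rp_hash above implement the compute-the-hash-once pattern for unordered_map. A standalone illustration with simplified names:

   #include <cstdint>
   #include <unordered_map>

   struct key {
      uint64_t hash; /* computed once, up front, from the renderpass parameters */
      bool operator==(const key &o) const noexcept { return hash == o.hash; }
   };

   /* The map's hasher just forwards the stored value instead of rehashing. */
   struct key_hash {
      size_t operator()(const key &k) const noexcept { return (size_t)k.hash; }
   };

   int main()
   {
      std::unordered_map<key, int, key_hash> histories;
      key k{0x1234abcd5678ef00ull};
      histories[k] = 42;         /* insert */
      return histories.count(k); /* the same precomputed hash is reused for lookup */
   }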
+
+   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
+   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
+   rp_histories_t rp_histories;
+   std::shared_mutex rp_mutex;
+   uint64_t last_reap_ts = 0;
+
+   /* Note: These lock rp_mutex internally, no need to lock it. */
+   rp_history_handle find_rp_history(const rp_key &key);
+   rp_history_handle find_or_create_rp_history(const rp_key &key);
+   void reap_old_rp_histories();
+
+   /** Debug Performance Counters **/
+
+#if TU_AUTOTUNE_DEBUG_PERFCTR
+   const fd_perfcntr_group *cp_group;
+   const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count;
+#endif
+
+ public:
+   tu_autotune(struct tu_device *device, VkResult &result);
+
+   ~tu_autotune();
+
+   /* Opaque pointer to an internal structure with RP context that needs to be preserved across begin/end calls. */
+   using rp_ctx_t = rp_entry *;
+
+   /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
+    *
+    * Note: tu_cmd_buffer is only responsible for the lifetime of this object; all access to the context state is
+    * done through tu_autotune.
+    */
+   struct cmd_buf_ctx {
+    private:
+      /* A batch of all entries from RPs within this CB. */
+      std::shared_ptr<rp_entry_batch> batch;
+
+      /* Creates a new RP entry attached to this CB. */
+      rp_entry *
+      attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
+
+      rp_entry *find_rp_entry(const rp_key &key);
+
+      friend struct tu_autotune;
+
+    public:
+      cmd_buf_ctx();
+      ~cmd_buf_ctx();
+
+      /* Resets the internal context; should be called when the tu_cmd_buffer state has been reset. */
+      void reset();
+   };
+
+   enum class render_mode {
+      SYSMEM,
+      GMEM,
+   };
+
+   render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
+
+   /* Returns the optimal tile size divisor for the given CB state. */
+   uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer);
+
+   /* Disables the preemption latency optimization within the autotuner. This is used when high-priority queues are
+    * present, to ensure that the autotuner does not interfere with the high-priority queue's performance.
+    *
+    * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior.
+    */
+   void disable_preempt_optimize();
+
+   void
+   begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count);
+
+   void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
+
+   void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
+
+   void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
+
+   /* The submit-time hook for the autotuner. This may return a CS (which can be NULL) that must be amended for
+    * autotuner tracking to function correctly.
+    *
+    * Note: This must be called from a single-threaded context. There should never be multiple threads calling this
+    * function at the same time.
+    */
+   struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
};
-
-/**
- * From the cmdstream, the captured samples-passed values are recorded
- * at the start and end of the batch.
- *
- * Note that we do the math on the CPU to avoid a WFI. But pre-emption
- * may force us to revisit that.
- */
-struct PACKED tu_renderpass_samples {
-   uint64_t samples_start;
-   /* hw requires the sample start/stop locations to be 128b aligned. */
-   uint64_t __pad0;
-   uint64_t samples_end;
-   uint64_t __pad1;
-};
-
-/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
-static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
-
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
-   /* Points into GPU memory */
-   struct tu_renderpass_samples* samples;
-
-   struct tu_suballoc_bo bo;
-
-   /*
-    * Below here, only used internally within autotune
-    */
-   uint64_t rp_key;
-   struct tu_renderpass_history *history;
-   struct list_head node;
-   uint32_t fence;
-   uint64_t samples_passed;
-};
-
-VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
-void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
-
-bool tu_autotune_use_bypass(struct tu_autotune *at,
-                            struct tu_cmd_buffer *cmd_buffer,
-                            struct tu_renderpass_result **autotune_result);
-void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
-
-bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
-                                       uint32_t cmd_buffer_count);
-
-/**
- * A magic 8-ball that tells the gmem code whether we should do bypass mode
- * for moar fps.
- */
-struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
-                                    struct tu_autotune *at,
-                                    struct tu_cmd_buffer **cmd_buffers,
-                                    uint32_t cmd_buffer_count);
-
-struct tu_autotune_results_buffer;
-
-template <chip CHIP>
-void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
-                                  struct tu_cs *cs,
-                                  struct tu_renderpass_result *autotune_result);
-
-template <chip CHIP>
-void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
-                                struct tu_cs *cs,
-                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */
@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
      }
   }

-   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
+   cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd);
+
+   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
+                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
}

struct apply_store_coords_state {
@ -14,6 +14,7 @@
#include "vk_render_pass.h"
#include "vk_util.h"

+#include "tu_autotune.h"
#include "tu_buffer.h"
#include "tu_clear_blit.h"
#include "tu_cs.h"
@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
+   struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_tiling_config *tiling =
+      tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it
@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
      return true;
   }

-   return vsc->binning;
+   return vsc->binning_possible && vsc->binning_useful;
}

static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result **autotune_result)
+                     tu_autotune::rp_ctx_t *rp_ctx)
{
   if (TU_DEBUG(SYSMEM)) {
      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
      return true;
   }

-   if (TU_DEBUG(GMEM))
+   if (TU_DEBUG(GMEM)) {
+      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)";
      return false;

-   bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
-                                            cmd, autotune_result);
-   if (*autotune_result) {
-      list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
   }

-   if (use_sysmem) {
+   /* This is a case where it's better to avoid GMEM: too many tiles, but no HW binning possible. */
+   if (!vsc->binning_possible && vsc->binning_useful) {
+      cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible";
+      return true;
+   }
+
+   bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
+   if (use_sysmem)
      cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
-   }

   return use_sysmem;
}
@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                        struct tu_renderpass_result *autotune_result)
+                        tu_autotune::rp_ctx_t rp_ctx)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
   }

-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0);

   tu_cs_sanity_check(cs);
}
@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result)
+                      tu_autotune::rp_ctx_t rp_ctx)
{
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
-
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
    */

@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit(cs, 0); /* value */
   }

+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
+
   tu_cs_sanity_check(cs);
}
@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result,
+                      tu_autotune::rp_ctx_t rp_ctx,
                      const VkOffset2D *fdm_offsets)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (use_cb)
      tu_trace_start_render_pass(cmd);

-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height;
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count);

   tu_cs_sanity_check(cs);
}
@ -3471,13 +3476,18 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                const struct tu_tile_config *tile,
-                bool fdm, const VkOffset2D *fdm_offsets)
+                bool fdm, const VkOffset2D *fdm_offsets,
+                tu_autotune::rp_ctx_t rp_ctx,
+                const struct tu_vsc_config *vsc)
{
+   uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x;
   tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
   tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);

   trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd);

+   cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);
+
   /* Primitives that passed all tests are still counted in in each
    * tile even with HW binning beforehand. Do not permit it.
    */

@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (cmd->state.prim_generated_query_running_before_rp)
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);

+   cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);
+
   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                    struct tu_renderpass_result *autotune_result)
+                    tu_autotune::rp_ctx_t rp_ctx)
{
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
-
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_lrz_tiling_end<CHIP>(cmd, cs);

@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);

+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
+
   tu_cs_sanity_check(cs);
}
@ -3796,7 +3808,9 @@ void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                   uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
                   const struct tu_image_view *fdm,
-                   const VkOffset2D *fdm_offsets)
+                   const VkOffset2D *fdm_offsets,
+                   tu_autotune::rp_ctx_t rp_ctx,
+                   const struct tu_vsc_config *vsc)
{
   uint32_t width = tx2 - tx1;
   uint32_t height = ty2 - ty1;

@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
         continue;

      tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
-                            true, fdm_offsets);
+                            true, fdm_offsets,
+                            rp_ctx, vsc);
      }
   }
}
@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
-                    struct tu_renderpass_result *autotune_result,
+                    tu_autotune::rp_ctx_t rp_ctx,
                    const VkOffset2D *fdm_offsets)
{
   const struct tu_tiling_config *tiling = cmd->state.tiling;

@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
   tu_cs_end(&cmd->tile_store_cs);

-   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
+   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);

   /* Note: we reverse the order of walking the pipes and tiles on every
    * other row, to improve texture cache locality compared to raster order.

@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

      if (merge_tiles) {
         tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
-                                  fdm_offsets);
+                                  fdm_offsets, rp_ctx, vsc);
         continue;
      }

@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
            tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);

            tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
-                                  fdm_offsets);
+                                  fdm_offsets,
+                                  rp_ctx, vsc);
         }
         slot_row += tile_row_stride;
      }
   }
}

-   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_trace_end_render_pass<CHIP>(cmd, true);
@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result *autotune_result)
+                     tu_autotune::rp_ctx_t rp_ctx)
{
   VkResult result = tu_allocate_transient_attachments(cmd, true);

@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   tu_trace_start_render_pass(cmd);

-   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);

   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);

@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);

-   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace,
                        cmd->trace_renderpass_start,

@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
   if (cmd_buffer->state.rp.has_tess)
      tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);

-   struct tu_renderpass_result *autotune_result = NULL;
-   if (use_sysmem_rendering(cmd_buffer, &autotune_result))
-      tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
+   tu_autotune::rp_ctx_t rp_ctx = NULL;
+   if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
+      tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
   else
-      tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
+      tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.clear_values = NULL;
   cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
+   cmd_buffer->state.gmem_layout_divisor = 0;
   cmd_buffer->state.renderpass_cb_disabled = false;
   memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));

@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
   u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
   cmd_buffer->trace_renderpass_start =
      u_trace_begin_iterator(&cmd_buffer->rp_trace);
-   list_inithead(&cmd_buffer->renderpass_autotune_results);
+   new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();

   if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
      cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);

@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
   u_trace_fini(&cmd_buffer->trace);
   u_trace_fini(&cmd_buffer->rp_trace);

-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.~cmd_buf_ctx();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      if (cmd_buffer->descriptors[i].push_set.layout)

@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
   tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);

-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.reset();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
   cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
   cmd->state.render_area = suspended->state.suspended_pass.render_area;
   cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
-   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
+   cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor;
+   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
+                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   cmd->state.lrz = suspended->state.suspended_pass.lrz;
}

@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
    * (perf queries), then we can't do this optimization since the
    * start-of-the-CS geometry condition will have been overwritten.
    */
-   bool cond_load_allowed = vsc->binning &&
+   bool cond_load_allowed = vsc->binning_possible &&
                            cmd->state.pass->has_cond_load_store &&
                            !cmd->state.rp.draw_cs_writes_to_cond_pred;

@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
      cmd->state.suspended_pass.attachments = cmd->state.attachments;
      cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
      cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
+      cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor;
   }

   tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
@ -524,11 +524,12 @@ struct tu_cmd_state
   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
-   enum tu_gmem_layout gmem_layout;
+   tu_gmem_layout gmem_layout;
+   uint32_t gmem_layout_divisor;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
-   const struct tu_framebuffer *framebuffer;
+   struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

@ -543,9 +544,10 @@ struct tu_cmd_state
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
-      const struct tu_framebuffer *framebuffer;
+      struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;
+      uint32_t gmem_layout_divisor;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

@ -644,8 +646,7 @@ struct tu_cmd_buffer
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace trace, rp_trace;

-   struct list_head renderpass_autotune_results;
-   struct tu_autotune_results_buffer* autotune_buffer;
+   tu_autotune::cmd_buf_ctx autotune_ctx;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;
@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = {
      DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
      DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
      DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
+      DRI_CONF_TU_AUTOTUNE_ALGORITHM()
   DRI_CONF_SECTION_END
};

@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance)
      driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction");
   instance->enable_softfloat32 =
      driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32");
+   instance->autotune_algo =
+      driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
}

static uint32_t instance_count = 0;
@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
{
   mtx_destroy(&device->bo_mutex);
   mtx_destroy(&device->pipeline_mutex);
-   mtx_destroy(&device->autotune_mutex);
   mtx_destroy(&device->kgsl_profiling_mutex);
   mtx_destroy(&device->event_mutex);
   mtx_destroy(&device->trace_mutex);

@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   VkResult result;
   struct tu_device *device;
   bool border_color_without_format = false;
+   bool autotune_disable_preempt_optimize = false;

   vk_foreach_struct_const (ext, pCreateInfo->pNext) {
      switch (ext->sType) {

@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,

   mtx_init(&device->bo_mutex, mtx_plain);
   mtx_init(&device->pipeline_mutex, mtx_plain);
-   mtx_init(&device->autotune_mutex, mtx_plain);
   mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
   mtx_init(&device->event_mutex, mtx_plain);
   mtx_init(&device->trace_mutex, mtx_plain);

@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create =
         &pCreateInfo->pQueueCreateInfos[i];
+      const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
+         vk_find_struct_const(queue_create->pNext,
+                              DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+      const VkQueueGlobalPriorityKHR global_priority = priority_info ?
+         priority_info->globalPriority :
+         (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
+                             VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
      uint32_t qfi = queue_create->queueFamilyIndex;
      enum tu_queue_type type = physical_device->queue_families[qfi].type;
      device->queues[qfi] = (struct tu_queue *) vk_alloc(
@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
      device->queue_count[qfi] = queue_create->queueCount;

      for (unsigned q = 0; q < queue_create->queueCount; q++) {
-         result = tu_queue_init(device, &device->queues[qfi][q], type, q,
-                                queue_create);
+         result = tu_queue_init(device, &device->queues[qfi][q], type,
+                                global_priority, q, queue_create);
         if (result != VK_SUCCESS) {
            device->queue_count[qfi] = q;
            goto fail_queues;
         }
      }

+      autotune_disable_preempt_optimize |=
+         (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
   }

   result = vk_meta_device_init(&device->vk, &device->meta);

@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                              TU_BO_ALLOC_ALLOW_DUMP |
                              TU_BO_ALLOC_INTERNAL_RESOURCE),
                             "pipeline_suballoc");
-   tu_bo_suballocator_init(&device->autotune_suballoc, device,
-                           128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
-                           "autotune_suballoc");
   if (is_kgsl(physical_device->instance)) {
      tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
                              128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   }
   pthread_condattr_destroy(&condattr);

-   result = tu_autotune_init(&device->autotune, device);
-   if (result != VK_SUCCESS) {
+   device->autotune = new tu_autotune(device, result);
+   if (result != VK_SUCCESS)
      goto fail_timeline_cond;
-   }
+
+   if (autotune_disable_preempt_optimize)
+      device->autotune->disable_preempt_optimize();

   device->use_z24uint_s8uint =
      physical_device->info->props.has_z24uint_s8uint &&

@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      free(device->dbg_renderpass_stomp_cs);
   }

-   tu_autotune_fini(&device->autotune, device);
+   delete device->autotune;

   tu_bo_suballocator_finish(&device->pipeline_suballoc);
-   tu_bo_suballocator_finish(&device->autotune_suballoc);
   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
   tu_bo_suballocator_finish(&device->event_suballoc);
   tu_bo_suballocator_finish(&device->vis_stream_suballocator);
@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device,
      }
   }

-   tu_framebuffer_tiling_config(framebuffer, device, pass);
+   tu_framebuffer_init_tiling_config(framebuffer, device, pass);

   /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */
   if (msrtss_attachment_count > 0) {

@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
         view->image->max_tile_h_constraint_fdm;
   }

-   tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
+   tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass);
}

VkResult
@ -28,6 +28,7 @@
#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"
+#include "util/rwlock.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

@ -233,6 +234,9 @@ struct tu_instance
    * However we don't want native Vulkan apps using this.
    */
   bool enable_softfloat32;

+   /* Configuration option to use a specific autotune algorithm by default. */
+   const char *autotune_algo;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
@@ -265,7 +269,12 @@ struct tu6_global

    volatile uint32_t vtx_stats_query_not_running;

-   /* To know when renderpass stats for autotune are valid */
+   /* A fence with a monotonically increasing value that is
+    * incremented by the GPU on each submission that includes
+    * a tu_autotune::submission_entry CS. This is used to track
+    * which submissions have been processed by the GPU before
+    * processing the autotune packet on the CPU.
+    */
    volatile uint32_t autotune_fence;

    /* For recycling command buffers for dynamic suspend/resume comamnds */
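For a sense of how such a monotonically increasing fence is typically consumed, a minimal sketch follows; the helper below is hypothetical and not part of turnip's API, it merely restates the new comment in code.

/* Hypothetical sketch, not driver code: a submission's autotune results are
 * safe to read once the GPU has advanced the fence to (or past) the value
 * associated with that submission. */
#include <stdint.h>

static bool autotune_results_ready(volatile const uint32_t *autotune_fence,
                                   uint32_t submission_fence_value)
{
   return *autotune_fence >= submission_fence_value;
}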
@@ -355,12 +364,6 @@ struct tu_device
    struct tu_suballocator pipeline_suballoc;
    mtx_t pipeline_mutex;

-   /* Device-global BO suballocator for reducing BO management for small
-    * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
-    */
-   struct tu_suballocator autotune_suballoc;
-   mtx_t autotune_mutex;
-
    /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
     * each submission.
     */
@@ -462,7 +465,7 @@ struct tu_device
    pthread_cond_t timeline_cond;
    pthread_mutex_t submit_mutex;

-   struct tu_autotune autotune;
+   struct tu_autotune *autotune;

    struct breadcrumbs_context *breadcrumbs_ctx;
@@ -547,8 +550,11 @@ struct tu_vsc_config {
    /* Whether binning could be used for gmem rendering using this framebuffer. */
    bool binning_possible;

-   /* Whether binning should be used for gmem rendering using this framebuffer. */
-   bool binning;
+   /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether
+    * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance
+    * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so.
+    */
+   bool binning_useful;

    /* pipe register values */
    uint32_t pipe_config[MAX_VSC_PIPES];
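Stated as code, the relationship described in the new comment amounts to the following; this helper is hypothetical and only illustrates the intended policy, it is not defined anywhere in this diff.

/* Hypothetical illustration: GMEM rendering stays a reasonable candidate
 * unless binning would be useful for this framebuffer but is not actually
 * possible with the hardware configuration at hand. */
static bool gmem_candidate(bool binning_useful, bool binning_possible)
{
   return !binning_useful || binning_possible;
}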
@@ -577,7 +583,8 @@ struct tu_framebuffer

    uint32_t max_tile_w_constraint;
    uint32_t max_tile_h_constraint;
-   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
+   uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. */
+   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX];

    uint32_t attachment_count;
    const struct tu_image_view *attachments[0];
@@ -22,6 +22,8 @@ enum tu_gmem_layout
    TU_GMEM_LAYOUT_COUNT,
 };

+constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */
+
 struct tu_subpass_barrier {
    VkPipelineStageFlags2 src_stage_mask;
    VkPipelineStageFlags2 dst_stage_mask;
@@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
    struct tu_device *device = queue->device;
    bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
    struct util_dynarray dump_cmds;
+   struct tu_cs *autotune_cs = NULL;

    if (vk_submit->buffer_bind_count ||
        vk_submit->image_bind_count ||
@@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
       }
    }

-   if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
-      struct tu_cs *autotune_cs = tu_autotune_on_submit(
-         device, &device->autotune, cmd_buffers, cmdbuf_count);
+   autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
+   if (autotune_cs) {
       submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
                          autotune_cs->entry_count);
    }
@@ -605,17 +605,10 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info)
 {
-   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
-      vk_find_struct_const(create_info->pNext,
-                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
-   const VkQueueGlobalPriorityKHR global_priority = priority_info ?
-      priority_info->globalPriority :
-      (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
-       VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
-
    const int priority = tu_get_submitqueue_priority(
       device->physical_device, global_priority, type,
       device->vk.enabled_features.globalPriorityQuery);
@@ -43,6 +43,7 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info);
@@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc)
    return tiles_per_pipe <= 32;
 }

+static void
+tu_tiling_config_divide_tile(const struct tu_device *dev,
+                             const struct tu_render_pass *pass,
+                             const struct tu_framebuffer *fb,
+                             const struct tu_tiling_config *tiling,
+                             struct tu_tiling_config *new_tiling,
+                             uint32_t divisor)
+{
+   assert(divisor > 0);
+
+   *new_tiling = *tiling;
+   if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) {
+      /* If the divisor is 1, or if the tiling is not possible, or if the
+       * tiling is invalid, just return the original tiling. */
+      return;
+   }
+
+   /* Get the hardware-specified alignment values. */
+   const uint32_t tile_align_w = pass->tile_align_w;
+   const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
+
+   /* Divide the current tile dimensions by the divisor. */
+   uint32_t new_tile_width = tiling->tile0.width / divisor;
+   uint32_t new_tile_height = tiling->tile0.height / divisor;
+
+   /* Clamp to the minimum alignment if necessary and align down. */
+   if (new_tile_width < tile_align_w)
+      new_tile_width = tile_align_w;
+   else
+      new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w);
+
+   if (new_tile_height < tile_align_h)
+      new_tile_height = tile_align_h;
+   else
+      new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h);
+
+   new_tiling->tile0.width = new_tile_width;
+   new_tiling->tile0.height = new_tile_height;
+
+   /* Recalculate the tile count from the framebuffer dimensions to ensure
+    * full coverage. */
+   new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width);
+   new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height);
+}
+
 static void
 tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc,
                                     const struct tu_device *dev,
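As a rough worked example of the division above (a standalone sketch: the 1920x1080 framebuffer, 256x256 base tile, divisor of 2, and 32x16 tile alignments are all assumed values, since the real numbers come from the render pass and device info):

/* Standalone sketch of the tile-division math, not driver code. */
#include <stdint.h>
#include <stdio.h>

static uint64_t round_down_to_npot(uint64_t value, uint32_t alignment)
{
   return value - (value % alignment);
}

int main()
{
   const uint32_t fb_w = 1920, fb_h = 1080;   /* assumed framebuffer size */
   const uint32_t align_w = 32, align_h = 16; /* assumed tile alignments */
   const uint32_t divisor = 2;
   uint32_t tile_w = 256 / divisor;           /* 128 */
   uint32_t tile_h = 256 / divisor;           /* 128 */

   tile_w = tile_w < align_w ? align_w : round_down_to_npot(tile_w, align_w);
   tile_h = tile_h < align_h ? align_h : round_down_to_npot(tile_h, align_h);

   /* Prints: 128x128 tiles, 15x9 tile grid */
   printf("%ux%u tiles, %ux%u tile grid\n", tile_w, tile_h,
          (fb_w + tile_w - 1) / tile_w, (fb_h + tile_h - 1) / tile_h);
   return 0;
}

With these assumed numbers, halving a 256x256 tile stays aligned, so the 1080p framebuffer goes from roughly an 8x5 grid to a 15x9 grid of smaller tiles.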
@@ -460,22 +505,18 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc,
 static void
 tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device)
 {
-   if (vsc->binning_possible) {
-      vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2;
+   vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2;

    if (TU_DEBUG(FORCEBIN))
-      vsc->binning = true;
+      vsc->binning_useful = true;
    if (TU_DEBUG(NOBIN))
-      vsc->binning = false;
-   } else {
-      vsc->binning = false;
-   }
+      vsc->binning_useful = false;
 }

 void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
+tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                              const struct tu_device *device,
                              const struct tu_render_pass *pass)
 {
    for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
       struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
@@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
          tu_tiling_config_update_binning(fdm_offset_vsc, device);
       }
    }

+   fb->initd_divisor = 1;
+}
+
+const struct tu_tiling_config *
+tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
+                                 const struct tu_device *device,
+                                 const struct tu_render_pass *pass,
+                                 int gmem_layout,
+                                 uint32_t divisor)
+{
+   assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX);
+   assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to
+                                              appropriately size the tiles for the framebuffer. */
+   struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout];
+
+   if (divisor > fb->initd_divisor) {
+      const struct tu_tiling_config *base_tiling =
+         tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1);
+      tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor);
+
+      struct tu_vsc_config *vsc = &tiling->vsc;
+      if (tiling->possible) {
+         tu_tiling_config_update_pipe_layout(vsc, device, false);
+         tu_tiling_config_update_pipes(vsc, device);
+         tu_tiling_config_update_binning(vsc, device);
+
+         struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
+         fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 };
+      }
+
+      if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */
+          (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */
+          (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */
+          ) {
+         /* Revert to the previous level's tiling configuration. */
+         *tiling = *base_tiling;
+      }
+
+      fb->initd_divisor = divisor;
+   }
+
+   return tiling;
 }

 void
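The lazy initialization above fills fb->tiling on demand, with the entry for a given (gmem_layout, divisor) pair living at index TU_GMEM_LAYOUT_COUNT * (divisor - 1) + gmem_layout. A small sketch of that mapping follows; the layout count of 2 is a stand-in chosen for illustration, not necessarily the driver's actual value.

// Sketch of the flat (layout, divisor) -> index mapping; stand-in constants.
#include <cassert>
#include <cstdio>

constexpr int LAYOUT_COUNT = 2; // stand-in for TU_GMEM_LAYOUT_COUNT
constexpr int DIVISOR_MAX = 6;  // stand-in for TU_GMEM_LAYOUT_DIVISOR_MAX

static int tiling_index(int gmem_layout, int divisor)
{
   assert(divisor >= 1 && divisor <= DIVISOR_MAX);
   return LAYOUT_COUNT * (divisor - 1) + gmem_layout;
}

int main()
{
   // Divisor 1 occupies indices 0..1, divisor 2 indices 2..3, and so on, so
   // each divisor level gets its own contiguous group of layouts.
   for (int divisor = 1; divisor <= 3; divisor++)
      for (int layout = 0; layout < LAYOUT_COUNT; layout++)
         std::printf("layout %d, divisor %d -> index %d\n",
                     layout, divisor, tiling_index(layout, divisor));
   return 0;
}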
@@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...)
 } while (0)

 void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
+tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                              const struct tu_device *device,
                              const struct tu_render_pass *pass);

+const struct tu_tiling_config *
+tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
+                                 const struct tu_device *device,
+                                 const struct tu_render_pass *pass,
+                                 int gmem_layout,
+                                 uint32_t divisor);
+
 #define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
@@ -657,6 +657,10 @@
    DRI_CONF_OPT_B(tu_enable_softfloat32, def, \
                   "Enable softfloat emulation for float32 denormals")

+#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \
+   DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
+                        "Set the preferred autotune algorithm")
+
 /**
  * \brief Honeykrisp specific configuration options
  */
@@ -28,10 +28,18 @@
 #include <stdint.h>
 #include <stdbool.h>

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 uint64_t
 rand_xorshift128plus(uint64_t seed[2]);

 void
 s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed);

+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
 #endif /* RAND_XOR_H */
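The extern "C" guards added above let C++ translation units include this header and link against the existing C implementation without name mangling getting in the way. A minimal usage sketch, where the include path is an assumption:

// Minimal C++ usage sketch enabled by the guards above (include path assumed).
#include <cstdint>
#include "util/rand_xor.h"

static uint64_t next_sample(uint64_t seed[2])
{
   // Resolves against the C symbol thanks to the header's extern "C" linkage.
   return rand_xorshift128plus(seed);
}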
@@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment)
    return ((value) & ~(uint64_t)(alignment - 1));
 }

+static inline uint64_t
+ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
+{
+   return value - (value % alignment);
+}
+
 /**
  * Align a value, only works pot alignemnts.
  */
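A quick numeric check of the new helper, with arbitrary illustrative values: 1000 % 96 is 40, so rounding 1000 down to a multiple of 96 gives 960. The existing power-of-two-only ROUND_DOWN_TO simply masks low bits and would return 928 for the same inputs, which is why a separate non-power-of-two variant is needed for the tile alignments used above.

/* Illustrative compile-time check of the example above. */
static_assert(1000 - (1000 % 96) == 960, "ROUND_DOWN_TO_NPOT example");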