Merge branch 'tu-newat' into 'main'

turnip: Autotuner Overhaul

See merge request mesa/mesa!37802
This commit is contained in:
Dhruv Mark Collins 2025-12-20 06:20:41 +05:30
commit adbb7f760f
17 changed files with 2309 additions and 848 deletions

View file

@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation which would not happen if the option
was set in the environment variable.
Autotune
^^^^^^^^
Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
autotune system, the behavior of which can be controlled with the following
environment variables:
.. envvar:: TU_AUTOTUNE_ALGO
Selects the algorithm used for autotuning. Supported values are:
``bandwidth``
Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
the one with lower estimated bandwidth.
``profiled``
Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses those
measurements to move a probability distribution towards the optimal choice
over time (a minimal sketch of this idea is shown after this section). This
algorithm tends to be far more accurate than the bandwidth algorithm at
choosing the optimal rendering mode, but may result in larger FPS variance
because the choice is randomly sampled from a probability distribution. This
is the default algorithm.
``profiled_imm``
Similar to ``profiled``, but only profiles the first few instances of an RP
and then sticks to the chosen mode for subsequent instances. This is meant
for single-frame traces run multiple times in CI, where this algorithm can
immediately choose the optimal rendering mode for each RP.
``prefer_sysmem``
Always chooses SYSMEM rendering. This is useful for games that don't benefit
from GMEM rendering due to their rendering patterns; when the goal is
performance, setting this is preferable to using ``TU_DEBUG=sysmem``.
The algorithm can be set via the driconf option ``tu_autotune_algorithm`` as well.
.. envvar:: TU_AUTOTUNE_FLAGS
Modifies the behavior of the selected algorithm. Supported flags are:
``big_gmem``
Always chooses GMEM rendering if the number of draw calls in the render pass
is greater than a certain threshold. Larger RPs generally benefit more from
GMEM rendering due to less overhead from tiling.
``small_sysmem``
Always chooses SYSMEM rendering if the number of draw calls in the render pass
is lower than a certain threshold. The benefits of GMEM rendering are less
pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
``preempt_optimize``
Tries to keep the non-preemptible time in the render pass below a certain
threshold. This is useful for systems with GPU-based compositors, where long
non-preemptible times can lead to missed frame deadlines and noticeable
stuttering. This flag trades render pass performance for overall system
responsiveness; it should not be used unless the rest of the system is
affected by preemption delays.
Multiple flags can be combined by separating them with commas, e.g.
``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
If no flags are specified, the default behavior is used.
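As an illustration of the ideas above, the following is a minimal, self-contained
C++ sketch of a probabilistic SYSMEM/GMEM selector combined with draw-count
thresholds, in the spirit of ``profiled`` plus ``big_gmem``/``small_sysmem``. It is
not the actual Turnip implementation; the names (``rp_profile``, ``choose_gmem``,
``record_timing``) and all constants are hypothetical.

#include <algorithm>
#include <cstdint>
#include <random>

/* Per-renderpass history: the probability of choosing GMEM plus the last timing
 * observed in each mode (0 means "not measured yet"). */
struct rp_profile {
   double p_gmem = 0.5;
   uint64_t gmem_ns = 0;
   uint64_t sysmem_ns = 0;
};

/* Force a mode for very large or very small render passes (big_gmem/small_sysmem),
 * otherwise sample the probability distribution. */
static bool choose_gmem(const rp_profile &h, uint32_t draw_count, std::mt19937_64 &rng)
{
   constexpr uint32_t BIG_GMEM_DRAWS = 300;    /* hypothetical thresholds */
   constexpr uint32_t SMALL_SYSMEM_DRAWS = 10;
   if (draw_count >= BIG_GMEM_DRAWS)
      return true;
   if (draw_count <= SMALL_SYSMEM_DRAWS)
      return false;
   return std::uniform_real_distribution<double>(0.0, 1.0)(rng) < h.p_gmem;
}

/* After the GPU reports how long the pass took in the mode that was used, nudge
 * the distribution towards whichever mode is currently faster. */
static void record_timing(rp_profile &h, bool was_gmem, uint64_t elapsed_ns)
{
   if (was_gmem)
      h.gmem_ns = elapsed_ns;
   else
      h.sysmem_ns = elapsed_ns;
   if (!h.gmem_ns || !h.sysmem_ns)
      return; /* need at least one sample of each mode first */
   const double step = (h.gmem_ns < h.sysmem_ns) ? 0.1 : -0.1;
   h.p_gmem = std::clamp(h.p_gmem + step, 0.05, 0.95); /* keep exploring both modes */
}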

View file

@ -4,7 +4,7 @@ DisableFormat: false
AlwaysBreakAfterReturnType: TopLevel
BinPackParameters: false
ColumnLimit: 78
ColumnLimit: 120
Cpp11BracedListStyle: false
IncludeBlocks: Regroup

File diff suppressed because it is too large

View file

@ -8,150 +8,265 @@
#include "tu_common.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include <atomic>
#include <deque>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#include "tu_cs.h"
#include "tu_suballoc.h"
struct tu_renderpass_history;
/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */
#define TU_AUTOTUNE_DEBUG_PERFCTR 0
/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
* data about a given render target.
*
* In deciding which path to take there are tradeoffs, including some that
* are not reasonably estimateable without having some additional information:
*
* (1) If you know you are touching every pixel (ie. there is a clear),
* then the GMEM path will at least not cost more memory bandwidth than
* sysmem[1]
*
* (2) If there is no clear, GMEM could potentially cost *more* bandwidth
* if there is sysmem->GMEM restore pass.
*
* (3) If you see a high draw count, that is an indication that there will be
* enough pixels accessed multiple times to benefit from the reduced
* memory bandwidth that GMEM brings
*
* (4) But high draw count where there is not much overdraw can actually be
* faster in bypass mode if it is pushing a lot of state change, due to
* not having to go thru the state changes per-tile[1]
*
* The approach taken is to measure the samples-passed for the batch to estimate
* the amount of overdraw to detect cases where the number of pixels touched is
* low.
*
* [1] ignoring early-tile-exit optimizations, but any draw that touches all/
* most of the tiles late in the tile-pass can defeat that
/* Autotune allows us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
* dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
* analysis, since we can adapt to the actual workload rather than relying on heuristics.
*/
struct tu_autotune {
/* We may have to disable autotuner if there are too many
* renderpasses in-flight.
*/
bool enabled;
private:
bool enabled = true;
struct tu_device *device;
/**
* Cache to map renderpass key to historical information about
* rendering to that particular render target.
/** Configuration **/
enum class algorithm : uint8_t;
enum class mod_flag : uint8_t;
enum class metric_flag : uint8_t;
/* Container for all autotune configuration options. */
struct PACKED config_t;
union PACKED packed_config_t;
/* Allows for thread-safe access to the configurations. */
struct atomic_config_t {
private:
std::atomic<uint32_t> config_bits = 0;
public:
atomic_config_t(config_t initial_config);
config_t load() const;
bool compare_and_store(config_t updated, config_t expected);
} active_config;
config_t get_env_config();
/** Global Fence and Internal CS Management **/
/* BO suballocator for reducing BO management overhead for small GMEM/SYSMEM autotune result buffers.
* Synchronized by suballoc_mutex.
*/
struct hash_table *ht;
struct u_rwlock ht_lock;
struct tu_suballocator suballoc;
std::mutex suballoc_mutex;
/**
* List of per-renderpass results that we are waiting for the GPU
* to finish with before reading back the results.
/* The next value to assign to tu6_global::autotune_fence; this is incremented during on_submit. */
uint32_t next_fence = 1;
/* A wrapper around a CS which sets the global autotune fence to a given fence value; this allows the lifetime of
* the CS to be managed ergonomically, including recycling it once the fence value has been reached.
*/
struct list_head pending_results;
/**
* List of per-submission data that we may want to free after we
* processed submission results.
* This could happend after command buffers which were in the submission
* are destroyed.
*/
struct list_head pending_submission_data;
/**
* List of per-submission data that has been finished and can be reused.
*/
struct list_head submission_data_pool;
uint32_t fence_counter;
uint32_t idx_counter;
};
/**
* From the cmdstream, the captured samples-passed values are recorded
* at the start and end of the batch.
*
* Note that we do the math on the CPU to avoid a WFI. But pre-emption
* may force us to revisit that.
*/
struct PACKED tu_renderpass_samples {
uint64_t samples_start;
/* hw requires the sample start/stop locations to be 128b aligned. */
uint64_t __pad0;
uint64_t samples_end;
uint64_t __pad1;
};
/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
struct submission_entry {
private:
uint32_t fence;
uint64_t samples_passed;
struct tu_cs fence_cs;
public:
explicit submission_entry(tu_device *device);
~submission_entry();
/* Disable move/copy, since this holds stable pointers to the fence_cs. */
submission_entry(const submission_entry &) = delete;
submission_entry &operator=(const submission_entry &) = delete;
submission_entry(submission_entry &&) = delete;
submission_entry &operator=(submission_entry &&) = delete;
/* The current state of the submission entry; used to track whether the CS is available for reuse, pending
* GPU completion, or currently being processed.
*/
bool is_active() const;
/* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
struct tu_cs *try_get_cs(uint32_t new_fence);
};
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
bool tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
/**
* A magic 8-ball that tells the gmem code whether we should do bypass mode
* for moar fps.
/* Unified pool for submission CSes.
* Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
*/
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune *at,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
std::deque<submission_entry> submission_entries;
struct tu_autotune_results_buffer;
/* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
struct tu_cs *get_cs_for_fence(uint32_t fence);
template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);
/** RP Entry Management **/
template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);
struct rp_gpu_data;
struct tile_gpu_data;
struct rp_entry;
/* A wrapper over all entries associated with a single command buffer. */
struct rp_entry_batch {
bool active; /* Whether the entry is ready to be processed, i.e. it has been submitted to the GPU queue and has a
valid fence. */
uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
determine when the entries can be processed. */
std::vector<std::unique_ptr<rp_entry>> entries;
rp_entry_batch();
/* Disable the copy/move to avoid performance hazards. */
rp_entry_batch(const rp_entry_batch &) = delete;
rp_entry_batch &operator=(const rp_entry_batch &) = delete;
rp_entry_batch(rp_entry_batch &&) = delete;
rp_entry_batch &operator=(rp_entry_batch &&) = delete;
void assign_fence(uint32_t new_fence);
};
/* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
* iteration and to ensure that we process the entries in the same order they were submitted.
*/
std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
/* Handles processing of entry batches that are pending to be processed.
*
* Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
* in the on_submit() method, which is called on every submit of a command buffer.
*/
void process_entries();
/** Renderpass State Tracking **/
struct rp_history;
struct rp_history_handle;
/* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
* be stable across runs, so it can be used to identify the same renderpass instance consistently.
*
* Note: We could potentially include the vector of data we extract from the parameters to generate the hash in
* rp_key as well, which would give true value-based equality rather than just hash-based equality; this has a
* cost, but it avoids issues caused by hash collisions.
*/
struct rp_key {
uint64_t hash;
rp_key(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd);
/* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
rp_key(const rp_key &key, uint32_t duplicates);
/* Equality operator, used in unordered_map. */
constexpr bool operator==(const rp_key &other) const noexcept
{
return hash == other.hash;
}
};
/* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
*
* Note: This should *NEVER* be used to calculate the hash itself, as that would lead to the hash being calculated
* multiple times rather than being calculated once and reused across multiple successive lookups, such as
* find_or_create_rp_history() followed by providing the hash to the rp_history constructor.
*/
struct rp_hash {
constexpr size_t operator()(const rp_key &key) const noexcept
{
/* Note: This will throw away the upper 32-bits on 32-bit architectures. */
return static_cast<size_t>(key.hash);
}
};
/* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
rp_histories_t rp_histories;
std::shared_mutex rp_mutex;
uint64_t last_reap_ts = 0;
/* Note: These lock rp_mutex internally; there is no need to lock it at the call site. */
rp_history_handle find_rp_history(const rp_key &key);
rp_history_handle find_or_create_rp_history(const rp_key &key);
void reap_old_rp_histories();
/** Debug Performance Counters **/
#if TU_AUTOTUNE_DEBUG_PERFCTR
const fd_perfcntr_group *cp_group;
const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count;
#endif
public:
tu_autotune(struct tu_device *device, VkResult &result);
~tu_autotune();
/* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */
using rp_ctx_t = rp_entry *;
/* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
*
* Note: tu_cmd_buffer is only responsible for the lifetime of this object; all access to the context state is
* done through tu_autotune.
*/
struct cmd_buf_ctx {
private:
/* A batch of all entries from RPs within this CB. */
std::shared_ptr<rp_entry_batch> batch;
/* Creates a new RP entry attached to this CB. */
rp_entry *
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
rp_entry *find_rp_entry(const rp_key &key);
friend struct tu_autotune;
public:
cmd_buf_ctx();
~cmd_buf_ctx();
/* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */
void reset();
};
enum class render_mode {
SYSMEM,
GMEM,
};
render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
/* Returns the optimal tile size divisor for the given CB state. */
uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer);
/* Disables the preemption latency optimization within the autotuner. This is used when high-priority queues are
* present, to ensure that the autotuner does not interfere with the high-priority queue's performance.
*
* Note: This should be called before any renderpass is started; otherwise it may lead to undefined behavior.
*/
void disable_preempt_optimize();
void
begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count);
void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
/* The submit-time hook for the autotuner. This may return a CS (can be NULL) which must be added to the
* submission for autotune tracking to function correctly.
*
* Note: This must be called from a single-threaded context. There should never be multiple threads calling this
* function at the same time.
*/
struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
};
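/* A minimal sketch of the bit-packing pattern behind atomic_config_t above: a config struct that fits in 32 bits
 * is stored in a single std::atomic<uint32_t>, so readers never take a lock and writers use compare-and-swap. The
 * field layout shown here is hypothetical; the real config_t is defined elsewhere and differs. Requires <atomic>,
 * <cstdint> and <cstring>.
 */
struct example_config {
   uint8_t algorithm;  /* which autotune algorithm is active */
   uint8_t mod_flags;  /* behaviour-modifying flags */
   uint16_t reserved;
};
static_assert(sizeof(example_config) == sizeof(uint32_t), "must fit in one atomic word");

struct example_atomic_config {
   std::atomic<uint32_t> bits{0};

   example_config load() const
   {
      uint32_t v = bits.load(std::memory_order_relaxed);
      example_config c;
      std::memcpy(&c, &v, sizeof(c));
      return c;
   }

   /* Returns true if the stored value still equalled `expected` and was replaced by `updated`. */
   bool compare_and_store(example_config updated, example_config expected)
   {
      uint32_t u, e;
      std::memcpy(&u, &updated, sizeof(u));
      std::memcpy(&e, &expected, sizeof(e));
      return bits.compare_exchange_strong(e, u, std::memory_order_relaxed);
   }
};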
#endif /* TU_AUTOTUNE_H */
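To make the fence-based CS lifetime management declared above more concrete, here is a minimal sketch of how a
pool of per-submission entries can be recycled once the GPU has written a monotonically increasing fence value.
This is illustrative only, not the actual driver code; ``example_entry``, ``example_pool`` and their members are
hypothetical.

#include <cstdint>
#include <deque>

struct example_entry {
   bool active = false; /* submitted and waiting for the GPU */
   uint32_t fence = 0;  /* value the GPU writes when this submission completes */
};

struct example_pool {
   std::deque<example_entry> entries;
   uint32_t next_fence = 1;

   /* Called at submit time: reuse a finished entry if possible, otherwise grow the pool. */
   example_entry &acquire()
   {
      for (example_entry &e : entries) {
         if (!e.active) {
            e.active = true;
            e.fence = next_fence++;
            return e;
         }
      }
      entries.push_back({true, next_fence++});
      return entries.back();
   }

   /* Called after reading back the global fence written by the GPU: every active entry whose fence value has been
    * reached is finished, so its results can be consumed and the entry returned to the pool. */
   void process(uint32_t completed_fence)
   {
      for (example_entry &e : entries) {
         if (e.active && e.fence <= completed_fence) {
            /* ...read back and consume this entry's results here... */
            e.active = false;
         }
      }
   }
};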

View file

@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
}
}
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd);
cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
}
struct apply_store_coords_state {

View file

@ -14,6 +14,7 @@
#include "vk_render_pass.h"
#include "vk_util.h"
#include "tu_autotune.h"
#include "tu_buffer.h"
#include "tu_clear_blit.h"
#include "tu_cs.h"
@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling =
tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
/* XFB commands are emitted for BINNING || SYSMEM, which makes it
@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
return true;
}
return vsc->binning;
return vsc->binning_possible && vsc->binning_useful;
}
static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result **autotune_result)
tu_autotune::rp_ctx_t *rp_ctx)
{
if (TU_DEBUG(SYSMEM)) {
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
}
if (TU_DEBUG(GMEM))
if (TU_DEBUG(GMEM)) {
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)";
return false;
bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
cmd, autotune_result);
if (*autotune_result) {
list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
}
if (use_sysmem) {
/* This is a case where it's better to avoid GMEM: too many tiles but no HW binning possible. */
if (!vsc->binning_possible && vsc->binning_useful) {
cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible";
return true;
}
bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
if (use_sysmem)
cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
}
return use_sysmem;
}
@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
}
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0);
tu_cs_sanity_check(cs);
}
@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
/* Do any resolves of the last subpass. These are handled in the
* tile_store_cs in the gmem path.
*/
@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit(cs, 0); /* value */
}
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
tu_cs_sanity_check(cs);
}
@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result,
tu_autotune::rp_ctx_t rp_ctx,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (use_cb)
tu_trace_start_render_pass(cmd);
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height;
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count);
tu_cs_sanity_check(cs);
}
@ -3471,13 +3476,18 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm, const VkOffset2D *fdm_offsets)
bool fdm, const VkOffset2D *fdm_offsets,
tu_autotune::rp_ctx_t rp_ctx,
const struct tu_vsc_config *vsc)
{
uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x;
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd);
cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);
/* Primitives that passed all tests are still counted in each
* tile even with HW binning beforehand. Do not permit it.
*/
@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (cmd->state.prim_generated_query_running_before_rp)
tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
tu_lrz_tiling_end<CHIP>(cmd, cs);
@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
tu_cs_sanity_check(cs);
}
@ -3796,7 +3808,9 @@ void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
const VkOffset2D *fdm_offsets,
tu_autotune::rp_ctx_t rp_ctx,
const struct tu_vsc_config *vsc)
{
uint32_t width = tx2 - tx1;
uint32_t height = ty2 - ty1;
@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
continue;
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
true, fdm_offsets);
true, fdm_offsets,
rp_ctx, vsc);
}
}
}
@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result,
tu_autotune::rp_ctx_t rp_ctx,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
tu_cs_end(&cmd->tile_store_cs);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);
/* Note: we reverse the order of walking the pipes and tiles on every
* other row, to improve texture cache locality compared to raster order.
@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
if (merge_tiles) {
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
fdm_offsets);
fdm_offsets, rp_ctx, vsc);
continue;
}
@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
fdm_offsets);
fdm_offsets,
rp_ctx, vsc);
}
slot_row += tile_row_stride;
}
}
}
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
tu_trace_end_render_pass<CHIP>(cmd, true);
@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
VkResult result = tu_allocate_transient_attachments(cmd, true);
@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
tu_trace_start_render_pass(cmd);
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);
trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);
@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace,
cmd->trace_renderpass_start,
@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
tu_autotune::rp_ctx_t rp_ctx = NULL;
if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
else
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->state.attachments = NULL;
cmd_buffer->state.clear_values = NULL;
cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
cmd_buffer->state.gmem_layout_divisor = 0;
cmd_buffer->state.renderpass_cb_disabled = false;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
cmd_buffer->trace_renderpass_start =
u_trace_begin_iterator(&cmd_buffer->rp_trace);
list_inithead(&cmd_buffer->renderpass_autotune_results);
new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();
if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);
@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
u_trace_fini(&cmd_buffer->trace);
u_trace_fini(&cmd_buffer->rp_trace);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
cmd_buffer->autotune_ctx.~cmd_buf_ctx();
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
if (cmd_buffer->descriptors[i].push_set.layout)
@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
cmd_buffer->autotune_ctx.reset();
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
cmd->state.render_area = suspended->state.suspended_pass.render_area;
cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor;
cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
cmd->state.lrz = suspended->state.suspended_pass.lrz;
}
@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
* (perf queries), then we can't do this optimization since the
* start-of-the-CS geometry condition will have been overwritten.
*/
bool cond_load_allowed = vsc->binning &&
bool cond_load_allowed = vsc->binning_possible &&
cmd->state.pass->has_cond_load_store &&
!cmd->state.rp.draw_cs_writes_to_cond_pred;
@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.suspended_pass.attachments = cmd->state.attachments;
cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor;
}
tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);

View file

@ -524,11 +524,12 @@ struct tu_cmd_state
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
tu_gmem_layout gmem_layout;
uint32_t gmem_layout_divisor;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
@ -543,9 +544,10 @@ struct tu_cmd_state
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
uint32_t gmem_layout_divisor;
const struct tu_image_view **attachments;
VkClearValue *clear_values;
@ -644,8 +646,7 @@ struct tu_cmd_buffer
struct u_trace_iterator trace_renderpass_start;
struct u_trace trace, rp_trace;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
tu_autotune::cmd_buf_ctx autotune_ctx;
void *patchpoints_ctx;
struct util_dynarray fdm_bin_patchpoints;

View file

@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = {
DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
DRI_CONF_SECTION_END
};
@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance)
driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction");
instance->enable_softfloat32 =
driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32");
instance->autotune_algo =
driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
}
static uint32_t instance_count = 0;
@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
{
mtx_destroy(&device->bo_mutex);
mtx_destroy(&device->pipeline_mutex);
mtx_destroy(&device->autotune_mutex);
mtx_destroy(&device->kgsl_profiling_mutex);
mtx_destroy(&device->event_mutex);
mtx_destroy(&device->trace_mutex);
@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
VkResult result;
struct tu_device *device;
bool border_color_without_format = false;
bool autotune_disable_preempt_optimize = false;
vk_foreach_struct_const (ext, pCreateInfo->pNext) {
switch (ext->sType) {
@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->bo_mutex, mtx_plain);
mtx_init(&device->pipeline_mutex, mtx_plain);
mtx_init(&device->autotune_mutex, mtx_plain);
mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
mtx_init(&device->event_mutex, mtx_plain);
mtx_init(&device->trace_mutex, mtx_plain);
@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queue_create =
&pCreateInfo->pQueueCreateInfos[i];
const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
vk_find_struct_const(queue_create->pNext,
DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
const VkQueueGlobalPriorityKHR global_priority = priority_info ?
priority_info->globalPriority :
(TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
uint32_t qfi = queue_create->queueFamilyIndex;
enum tu_queue_type type = physical_device->queue_families[qfi].type;
device->queues[qfi] = (struct tu_queue *) vk_alloc(
@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->queue_count[qfi] = queue_create->queueCount;
for (unsigned q = 0; q < queue_create->queueCount; q++) {
result = tu_queue_init(device, &device->queues[qfi][q], type, q,
queue_create);
result = tu_queue_init(device, &device->queues[qfi][q], type,
global_priority, q, queue_create);
if (result != VK_SUCCESS) {
device->queue_count[qfi] = q;
goto fail_queues;
}
}
autotune_disable_preempt_optimize |=
(global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
}
result = vk_meta_device_init(&device->vk, &device->meta);
@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
TU_BO_ALLOC_ALLOW_DUMP |
TU_BO_ALLOC_INTERNAL_RESOURCE),
"pipeline_suballoc");
tu_bo_suballocator_init(&device->autotune_suballoc, device,
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
"autotune_suballoc");
if (is_kgsl(physical_device->instance)) {
tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
pthread_condattr_destroy(&condattr);
result = tu_autotune_init(&device->autotune, device);
if (result != VK_SUCCESS) {
device->autotune = new tu_autotune(device, result);
if (result != VK_SUCCESS)
goto fail_timeline_cond;
}
if (autotune_disable_preempt_optimize)
device->autotune->disable_preempt_optimize();
device->use_z24uint_s8uint =
physical_device->info->props.has_z24uint_s8uint &&
@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
free(device->dbg_renderpass_stomp_cs);
}
tu_autotune_fini(&device->autotune, device);
delete device->autotune;
tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->autotune_suballoc);
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
tu_bo_suballocator_finish(&device->event_suballoc);
tu_bo_suballocator_finish(&device->vis_stream_suballocator);
@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device,
}
}
tu_framebuffer_tiling_config(framebuffer, device, pass);
tu_framebuffer_init_tiling_config(framebuffer, device, pass);
/* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */
if (msrtss_attachment_count > 0) {
@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
view->image->max_tile_h_constraint_fdm;
}
tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass);
}
VkResult

View file

@ -28,6 +28,7 @@
#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"
#include "util/rwlock.h"
/* queue types */
#define TU_QUEUE_GENERAL 0
@ -233,6 +234,9 @@ struct tu_instance
* However we don't want native Vulkan apps using this.
*/
bool enable_softfloat32;
/* Configuration option to use a specific autotune algorithm by default. */
const char *autotune_algo;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
VK_OBJECT_TYPE_INSTANCE)
@ -265,7 +269,12 @@ struct tu6_global
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
/* A fence with a monotonically increasing value that is
* incremented by the GPU on each submission that includes
* a tu_autotune::submission_entry CS. This is used to track
* which submissions the GPU has finished before their
* autotune results are processed on the CPU.
*/
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
@ -355,12 +364,6 @@ struct tu_device
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
* each submission.
*/
@ -462,7 +465,7 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
struct tu_autotune *autotune;
struct breadcrumbs_context *breadcrumbs_ctx;
@ -547,8 +550,11 @@ struct tu_vsc_config {
/* Whether binning could be used for gmem rendering using this framebuffer. */
bool binning_possible;
/* Whether binning should be used for gmem rendering using this framebuffer. */
bool binning;
/* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether
* binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance
* hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so.
*/
bool binning_useful;
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
@ -577,7 +583,8 @@ struct tu_framebuffer
uint32_t max_tile_w_constraint;
uint32_t max_tile_h_constraint;
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
uint32_t initd_divisor; /* Tile divisors up to and including this value have been initialized (lazy init). */
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX];
uint32_t attachment_count;
const struct tu_image_view *attachments[0];

View file

@ -22,6 +22,8 @@ enum tu_gmem_layout
TU_GMEM_LAYOUT_COUNT,
};
constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */
struct tu_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask;
VkPipelineStageFlags2 dst_stage_mask;

View file

@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
struct tu_device *device = queue->device;
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
struct util_dynarray dump_cmds;
struct tu_cs *autotune_cs = NULL;
if (vk_submit->buffer_bind_count ||
vk_submit->image_bind_count ||
@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
}
}
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
struct tu_cs *autotune_cs = tu_autotune_on_submit(
device, &device->autotune, cmd_buffers, cmdbuf_count);
autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
if (autotune_cs) {
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
autotune_cs->entry_count);
}
@ -605,17 +605,10 @@ VkResult
tu_queue_init(struct tu_device *device,
struct tu_queue *queue,
enum tu_queue_type type,
const VkQueueGlobalPriorityKHR global_priority,
int idx,
const VkDeviceQueueCreateInfo *create_info)
{
const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
vk_find_struct_const(create_info->pNext,
DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
const VkQueueGlobalPriorityKHR global_priority = priority_info ?
priority_info->globalPriority :
(TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
const int priority = tu_get_submitqueue_priority(
device->physical_device, global_priority, type,
device->vk.enabled_features.globalPriorityQuery);

View file

@ -43,6 +43,7 @@ VkResult
tu_queue_init(struct tu_device *device,
struct tu_queue *queue,
enum tu_queue_type type,
const VkQueueGlobalPriorityKHR global_priority,
int idx,
const VkDeviceQueueCreateInfo *create_info);

View file

@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc)
return tiles_per_pipe <= 32;
}
static void
tu_tiling_config_divide_tile(const struct tu_device *dev,
const struct tu_render_pass *pass,
const struct tu_framebuffer *fb,
const struct tu_tiling_config *tiling,
struct tu_tiling_config *new_tiling,
uint32_t divisor)
{
assert(divisor > 0);
*new_tiling = *tiling;
if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) {
/* If the divisor is 1, or if the tiling is not possible, or if the
* tiling is invalid, just return the original tiling. */
return;
}
/* Get the hardware-specified alignment values. */
const uint32_t tile_align_w = pass->tile_align_w;
const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
/* Divide the current tile dimensions by the divisor. */
uint32_t new_tile_width = tiling->tile0.width / divisor;
uint32_t new_tile_height = tiling->tile0.height / divisor;
/* Clamp to the minimum alignment if necessary and align down. */
if (new_tile_width < tile_align_w)
new_tile_width = tile_align_w;
else
new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w);
if (new_tile_height < tile_align_h)
new_tile_height = tile_align_h;
else
new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h);
new_tiling->tile0.width = new_tile_width;
new_tiling->tile0.height = new_tile_height;
/* Recalculate the tile count from the framebuffer dimensions to ensure
* full coverage. */
new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width);
new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height);
}
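/* Worked example with illustrative numbers (the actual alignment requirements vary per GPU): for a 1920x1080
 * framebuffer with tile0 = 384x288, tile_align_w = 96, tile_align_h = 16 and divisor = 2, the division gives
 * 192x144; both values are already multiples of their alignment, so the tile count becomes
 * DIV_ROUND_UP(1920, 192) x DIV_ROUND_UP(1080, 144) = 10 x 8 tiles.
 */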
static void
tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc,
const struct tu_device *dev,
@ -460,20 +505,16 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc,
static void
tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device)
{
if (vsc->binning_possible) {
vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2;
vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2;
if (TU_DEBUG(FORCEBIN))
vsc->binning = true;
vsc->binning_useful = true;
if (TU_DEBUG(NOBIN))
vsc->binning = false;
} else {
vsc->binning = false;
}
vsc->binning_useful = false;
}
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass)
{
@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_tiling_config_update_binning(fdm_offset_vsc, device);
}
}
fb->initd_divisor = 1;
}
const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass,
int gmem_layout,
uint32_t divisor)
{
assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX);
assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to
appropriately size the tiles for the framebuffer.*/
struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout];
if (divisor > fb->initd_divisor) {
const struct tu_tiling_config *base_tiling =
tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1);
tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor);
struct tu_vsc_config *vsc = &tiling->vsc;
if (tiling->possible) {
tu_tiling_config_update_pipe_layout(vsc, device, false);
tu_tiling_config_update_pipes(vsc, device);
tu_tiling_config_update_binning(vsc, device);
struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 };
}
if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */
(vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */
(vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */
) {
/* Revert to the previous level's tiling configuration. */
*tiling = *base_tiling;
}
fb->initd_divisor = divisor;
}
return tiling;
}
void

View file

@ -136,10 +136,17 @@ __tu_finishme(const char *file, int line, const char *format, ...)
} while (0)
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass,
int gmem_layout,
uint32_t divisor);
#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
#define tu_foreach_stage(stage, stage_bits) \

View file

@ -657,6 +657,10 @@
DRI_CONF_OPT_B(tu_enable_softfloat32, def, \
"Enable softfloat emulation for float32 denormals")
#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \
DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
"Set the preferred autotune algorithm")
/**
* \brief Honeykrisp specific configuration options
*/

View file

@ -28,10 +28,18 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
uint64_t
rand_xorshift128plus(uint64_t seed[2]);
void
s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed);
#ifdef __cplusplus
} /* end of extern "C" */
#endif
#endif /* RAND_XOR_H */

View file

@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment)
return ((value) & ~(uint64_t)(alignment - 1));
}
static inline uint64_t
ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
{
return value - (value % alignment);
}
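/* Unlike ROUND_DOWN_TO, the alignment does not need to be a power of two,
 * e.g. ROUND_DOWN_TO_NPOT(200, 96) == 192 and ROUND_DOWN_TO_NPOT(150, 96) == 96.
 */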
/**
* Align a value; only works for power-of-two alignments.
*/