From 11d5f3be82bc9dc27077ec744f2cde7ce37eeb99 Mon Sep 17 00:00:00 2001
From: Dhruv Mark Collins
Date: Thu, 9 Oct 2025 13:56:29 +0000
Subject: [PATCH 01/25] tu: Increase clang-format ColumnLimit to 120

The existing column limit hurt readability by forcing excessive line
wrapping, especially in C++ code.

Signed-off-by: Dhruv Mark Collins
---
 src/freedreno/vulkan/.clang-format | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/freedreno/vulkan/.clang-format b/src/freedreno/vulkan/.clang-format
index f7f9e5755db..256e3ff892f 100644
--- a/src/freedreno/vulkan/.clang-format
+++ b/src/freedreno/vulkan/.clang-format
@@ -4,7 +4,7 @@ DisableFormat: false
 
 AlwaysBreakAfterReturnType: TopLevel
 BinPackParameters: false
-ColumnLimit: 78
+ColumnLimit: 120
 Cpp11BracedListStyle: false
 IncludeBlocks: Regroup
 

From 759589944a4daaa06862c35838e88fc61667576c Mon Sep 17 00:00:00 2001
From: Dhruv Mark Collins
Date: Thu, 9 Oct 2025 13:56:25 +0000
Subject: [PATCH 02/25] tu: Move tu_autotune_end_renderpass as late as possible

The tu_autotune_end_renderpass function collects timestamp data for the
renderpass and should be called as late as possible for the most
complete data.

Signed-off-by: Dhruv Mark Collins
---
 src/freedreno/vulkan/tu_cmd_buffer.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 2619b6829b6..df892beaa91 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -3085,8 +3085,6 @@ static void
 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                       struct tu_renderpass_result *autotune_result)
 {
-   tu_autotune_end_renderpass(cmd, cs, autotune_result);
-
    /* Do any resolves of the last subpass. These are handled in the
     * tile_store_cs in the gmem path.
     */
@@ -3113,6 +3111,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
       tu_cs_emit(cs, 0); /* value */
    }
 
+   tu_autotune_end_renderpass(cmd, cs, autotune_result);
+
    tu_cs_sanity_check(cs);
 }
 
@@ -3516,8 +3516,6 @@ static void
 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                     struct tu_renderpass_result *autotune_result)
 {
-   tu_autotune_end_renderpass(cmd, cs, autotune_result);
-
    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
 
    tu_lrz_tiling_end(cmd, cs);
@@ -3546,6 +3544,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 
    tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
 
+   tu_autotune_end_renderpass(cmd, cs, autotune_result);
+
    tu_cs_sanity_check(cs);
 }
 

From d8ff474b70da044555a2447b846b565496989451 Mon Sep 17 00:00:00 2001
From: Dhruv Mark Collins
Date: Thu, 9 Oct 2025 19:34:43 +0000
Subject: [PATCH 03/25] tu: Rewrite autotune in C++

Completely overhauls the autotuner in C++, extending its functionality
as well.

Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 34 + src/freedreno/vulkan/tu_autotune.cc | 1429 ++++++++++++++----------- src/freedreno/vulkan/tu_autotune.h | 348 +++--- src/freedreno/vulkan/tu_cmd_buffer.cc | 55 +- src/freedreno/vulkan/tu_cmd_buffer.h | 3 +- src/freedreno/vulkan/tu_device.cc | 13 +- src/freedreno/vulkan/tu_device.h | 16 +- src/freedreno/vulkan/tu_queue.cc | 6 +- 8 files changed, 1094 insertions(+), 810 deletions(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index f57db5ff18d..cfdcaa21941 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -665,3 +665,37 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi Some of these options will behave differently when toggled at runtime, for example: ``nolrz`` will still result in LRZ allocation which would not happen if the option was set in the environment variable. + +Autotune +^^^^^^^^ + +Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the +autotune system, the behavior of which can be controlled with the following +environment variables: + +.. envvar:: TU_AUTOTUNE_ALGO + + Selects the algorithm used for autotuning. Supported values are: + + ``bandwidth`` + Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses + the one with lower estimated bandwidth. This is the default algorithm. + +.. envvar:: TU_AUTOTUNE_FLAGS + + Modifies the behavior of the selected algorithm. Supported flags are: + + ``big_gmem`` + Always chooses GMEM rendering if the amount of draw calls in the render pass + is greater than a certain threshold. Larger RPs generally benefit more from + GMEM rendering due to less overhead from tiling. + + ``small_sysmem`` + Always chooses SYSMEM rendering if the amount of draw calls in the render pass + is lower than a certain threshold. The benefits of GMEM rendering are less + pronounced in these smaller RPs and SYSMEM rendering tends to win more often. + + Multiple flags can be combined by separating them with commas, e.g. + ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``. + + If no flags are specified, the default behavior is used. \ No newline at end of file diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index df969834a37..f7de8603a29 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -5,113 +5,305 @@ #include "tu_autotune.h" +#include +#include +#include +#include +#include + +#include "util/rand_xor.h" + +#define XXH_INLINE_ALL +#include "util/xxhash.h" + #include "tu_cmd_buffer.h" #include "tu_cs.h" #include "tu_device.h" #include "tu_image.h" #include "tu_pass.h" -#define XXH_INLINE_ALL -#include "util/xxhash.h" +/** Compile-time debug options **/ -/* How does it work? - * - * - For each renderpass we calculate the number of samples passed - * by storing the number before and after in GPU memory. - * - To store the values each command buffer holds GPU memory which - * expands with more renderpasses being written. - * - For each renderpass we create tu_renderpass_result entry which - * points to the results in GPU memory. - * - Later on tu_renderpass_result would be added to the - * tu_renderpass_history entry which aggregate results for a - * given renderpass. - * - On submission: - * - Process results which fence was signalled. - * - Free per-submission data which we now don't need. - * - * - Create a command stream to write a fence value. 
This way we would - * know when we could safely read the results. - * - We cannot rely on the command buffer's lifetime when referencing - * its resources since the buffer could be destroyed before we process - * the results. - * - For each command buffer: - * - Reference its GPU memory. - * - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue. - * - * Since the command buffers could be recorded on different threads - * we have to maintaining some amount of locking history table, - * however we change the table only in a single thread at the submission - * time, so in most cases there will be no locking. - */ +#define TU_AUTOTUNE_DEBUG_LOG_BASE 0 +#define TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH 0 -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results); +#if TU_AUTOTUNE_DEBUG_LOG_BASE +#define at_log_base(fmt, ...) mesa_logi("autotune: " fmt, ##__VA_ARGS__) +#define at_log_base_h(fmt, hash, ...) mesa_logi("autotune %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_base(fmt, ...) +#define at_log_base_h(fmt, hash, ...) +#endif -#define TU_AUTOTUNE_DEBUG_LOG 0 -/* Dump history entries on autotuner finish, - * could be used to gather data from traces. - */ -#define TU_AUTOTUNE_LOG_AT_FINISH 0 +#if TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH +#define at_log_bandwidth_h(fmt, hash, ...) mesa_logi("autotune-bw %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_bandwidth_h(fmt, hash, ...) +#endif -/* How many last renderpass stats are taken into account. */ -#define MAX_HISTORY_RESULTS 5 -/* For how many submissions we store renderpass stats. */ -#define MAX_HISTORY_LIFETIME 128 +/* Process any pending entries on autotuner finish, could be used to gather data from traces. */ +#define TU_AUTOTUNE_FLUSH_AT_FINISH 0 +/** Global constants and helpers **/ -/** - * Tracks results for a given renderpass key - */ -struct tu_renderpass_history { - uint64_t key; +/* GPU always-on timer constants */ +constexpr uint64_t ALWAYS_ON_FREQUENCY_HZ = 19'200'000; +constexpr uint64_t GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1'000'000; - /* We would delete old history entries */ - uint32_t last_fence; - - /** - * List of recent fd_renderpass_result's - */ - struct list_head results; - uint32_t num_results; - - uint32_t avg_samples; -}; - -/* Holds per-submission cs which writes the fence. */ -struct tu_submission_data { - struct list_head node; - uint32_t fence; - - struct tu_cs fence_cs; -}; - -static bool -fence_before(uint32_t a, uint32_t b) +constexpr uint64_t +ticks_to_us(uint64_t ticks) { - /* essentially a < b, but handle wrapped values */ - return (int32_t)(a - b) < 0; + return ticks / GPU_TICKS_PER_US; } -static uint32_t -get_autotune_fence(struct tu_autotune *at) +constexpr bool +fence_before(uint32_t a, uint32_t b) { - return at->device->global_bo_map->autotune_fence; + /* Essentially a < b, but handles wrapped values. */ + return (int32_t) (a - b) < 0; +} + +constexpr const char * +render_mode_str(tu_autotune::render_mode mode) +{ + switch (mode) { + case tu_autotune::render_mode::SYSMEM: + return "SYSMEM"; + case tu_autotune::render_mode::GMEM: + return "GMEM"; + default: + return "UNKNOWN"; + } +} + +/** Configuration **/ + +enum class tu_autotune::algorithm : uint8_t { + BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ + + DEFAULT = BANDWIDTH, /* Default algorithm, used if no other is specified. */ +}; + +/* Modifier flags, these modify the behavior of the autotuner in a user-defined way. 
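+ * These correspond one-to-one to the TU_AUTOTUNE_FLAGS environment variable parsed in get_env_config(),
+ * e.g. TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem enables both modifiers.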
*/ +enum class tu_autotune::mod_flag : uint8_t { + BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ + SMALL_SYSMEM = BIT(2), /* All RPs with <= 5 draws use SYSMEM. */ +}; + +/* Metric flags, for internal tracking of enabled metrics. */ +enum class tu_autotune::metric_flag : uint8_t { + SAMPLES = BIT(1), /* Enable tracking samples passed metric. */ +}; + +struct PACKED tu_autotune::config_t { + private: + algorithm algo = algorithm::DEFAULT; + uint8_t mod_flags = 0; /* See mod_flag enum. */ + uint8_t metric_flags = 0; /* See metric_flag enum. */ + + constexpr void update_metric_flags() + { + /* Note: Always keep in sync with rp_history to prevent UB. */ + if (algo == algorithm::BANDWIDTH) { + metric_flags |= (uint8_t) metric_flag::SAMPLES; + } + } + + public: + constexpr config_t() = default; + + constexpr config_t(algorithm algo, uint8_t mod_flags): algo(algo), mod_flags(mod_flags) + { + update_metric_flags(); + } + + constexpr bool is_enabled(algorithm a) const + { + return algo == a; + } + + constexpr bool test(mod_flag f) const + { + return mod_flags & (uint32_t) f; + } + + constexpr bool test(metric_flag f) const + { + return metric_flags & (uint32_t) f; + } + + constexpr bool set_algo(algorithm a) + { + if (algo == a) + return false; + + algo = a; + update_metric_flags(); + return true; + } + + constexpr bool disable(mod_flag f) + { + if (!(mod_flags & (uint8_t) f)) + return false; + + mod_flags &= ~(uint8_t) f; + update_metric_flags(); + return true; + } + + constexpr bool enable(mod_flag f) + { + if (mod_flags & (uint8_t) f) + return false; + + mod_flags |= (uint8_t) f; + update_metric_flags(); + return true; + } + + std::string to_string() const + { +#define ALGO_STR(algo_name) \ + if (algo == algorithm::algo_name) \ + str += #algo_name; +#define MODF_STR(flag) \ + if (mod_flags & (uint8_t) mod_flag::flag) { \ + str += #flag " "; \ + } +#define METRICF_STR(flag) \ + if (metric_flags & (uint8_t) metric_flag::flag) { \ + str += #flag " "; \ + } + + std::string str = "Algorithm: "; + + ALGO_STR(BANDWIDTH); + + str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; + MODF_STR(BIG_GMEM); + MODF_STR(SMALL_SYSMEM); + str += ")"; + + str += ", Metric Flags: 0x" + std::to_string(metric_flags) + " ("; + METRICF_STR(SAMPLES); + str += ")"; + + return str; + +#undef ALGO_STR +#undef MODF_STR +#undef METRICF_STR + } +}; + +union tu_autotune::packed_config_t { + config_t config; + uint32_t bits = 0; + static_assert(sizeof(bits) >= sizeof(config)); + static_assert(std::is_trivially_copyable::value, + "config_t must be trivially copyable to be automatically packed"); + + constexpr packed_config_t(config_t config): config(config) + { + } + + constexpr packed_config_t(uint32_t bits): bits(bits) + { + } +}; + +tu_autotune::atomic_config_t::atomic_config_t(config_t initial): config_bits(packed_config_t { initial }.bits) +{ +} + +tu_autotune::config_t +tu_autotune::atomic_config_t::load() const +{ + return config_t(packed_config_t { config_bits.load(std::memory_order_relaxed) }.config); +} + +bool +tu_autotune::atomic_config_t::compare_and_store(config_t updated, config_t expected) +{ + uint32_t expected_bits = packed_config_t { expected }.bits; + return config_bits.compare_exchange_strong(expected_bits, packed_config_t { updated }.bits, + std::memory_order_acquire, std::memory_order_relaxed); +} + +tu_autotune::config_t +tu_autotune::get_env_config() +{ + static std::once_flag once; + static config_t at_config; + std::call_once(once, [&] { + const char *algo_env_str = 
os_get_option("TU_AUTOTUNE_ALGO"); + algorithm algo = algorithm::DEFAULT; + + if (algo_env_str) { + std::string_view algo_strv(algo_env_str); + if (algo_strv == "bandwidth") { + algo = algorithm::BANDWIDTH; + } + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_env_str); + } + + /* Parse the flags from the environment variable. */ + const char *flags_env_str = os_get_option("TU_AUTOTUNE_FLAGS"); + uint32_t mod_flags = 0; + if (flags_env_str) { + static const struct debug_control tu_at_flags_control[] = { + { "big_gmem", (uint32_t) mod_flag::BIG_GMEM }, + { "small_sysmem", (uint32_t) mod_flag::SMALL_SYSMEM }, + { NULL, 0 } + }; + + mod_flags = parse_debug_string(flags_env_str, tu_at_flags_control); + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_FLAGS=0x%x (%s)", mod_flags, flags_env_str); + } + + assert((uint8_t) mod_flags == mod_flags); + at_config = config_t(algo, (uint8_t) mod_flags); + }); + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE: %s", at_config.to_string().c_str()); + + return at_config; +} + +/** Global Fence and Internal CS Management **/ + +tu_autotune::submission_entry::submission_entry(tu_device *device): fence(0) +{ + tu_cs_init(&fence_cs, device, TU_CS_MODE_GROW, 5, "autotune fence cs"); +} + +tu_autotune::submission_entry::~submission_entry() +{ + assert(!is_active()); + tu_cs_finish(&fence_cs); +} + +bool +tu_autotune::submission_entry::is_active() const +{ + return fence_cs.device->global_bo_map->autotune_fence < fence; } template static void -create_submission_fence(struct tu_device *dev, - struct tu_cs *cs, - uint32_t fence) +write_fence_cs(struct tu_device *dev, struct tu_cs *cs, uint32_t fence) { uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence); if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); - tu_cs_emit(cs, - CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, - .write_src = EV_WRITE_USER_32B, - .write_dst = EV_DST_RAM, - .write_enabled = true).value); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, .write_src = EV_WRITE_USER_32B, .write_dst = EV_DST_RAM, + .write_enabled = true) + .value); } else { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); @@ -121,626 +313,617 @@ create_submission_fence(struct tu_device *dev, tu_cs_emit(cs, fence); } -static struct tu_submission_data * -create_submission_data(struct tu_device *dev, struct tu_autotune *at, - uint32_t fence) +struct tu_cs * +tu_autotune::submission_entry::try_get_cs(uint32_t new_fence) { - struct tu_submission_data *submission_data = NULL; - if (!list_is_empty(&at->submission_data_pool)) { - submission_data = list_first_entry(&at->submission_data_pool, - struct tu_submission_data, node); - list_del(&submission_data->node); - } else { - submission_data = (struct tu_submission_data *) calloc( - 1, sizeof(struct tu_submission_data)); - tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs"); - } - submission_data->fence = fence; - - struct tu_cs* fence_cs = &submission_data->fence_cs; - tu_cs_begin(fence_cs); - TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence); - tu_cs_end(fence_cs); - - list_addtail(&submission_data->node, &at->pending_submission_data); - - return submission_data; -} - -static void -finish_submission_data(struct tu_autotune *at, - struct tu_submission_data *data) -{ - list_del(&data->node); - list_addtail(&data->node, &at->submission_data_pool); - tu_cs_reset(&data->fence_cs); -} - -static void -free_submission_data(struct 
tu_submission_data *data) -{ - list_del(&data->node); - tu_cs_finish(&data->fence_cs); - - free(data); -} - -static uint64_t -hash_renderpass_instance(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd) { - uint32_t data[3 + pass->attachment_count * 5]; - uint32_t* ptr = data; - - *ptr++ = framebuffer->width; - *ptr++ = framebuffer->height; - *ptr++ = framebuffer->layers; - - for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + if (is_active()) { + /* If the CS is already active, we cannot write to it. */ + return nullptr; } - return XXH64(data, sizeof(data), pass->autotune_hash); + struct tu_device *device = fence_cs.device; + tu_cs_reset(&fence_cs); + tu_cs_begin(&fence_cs); + TU_CALLX(device, write_fence_cs)(device, &fence_cs, new_fence); + tu_cs_end(&fence_cs); + assert(fence_cs.entry_count == 1); /* We expect the initial allocation to be large enough. */ + fence = new_fence; + + return &fence_cs; } -static void -free_result(struct tu_device *dev, struct tu_renderpass_result *result) +struct tu_cs * +tu_autotune::get_cs_for_fence(uint32_t fence) { - tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo); - list_del(&result->node); - free(result); + for (submission_entry &entry : submission_entries) { + struct tu_cs *cs = entry.try_get_cs(fence); + if (cs) + return cs; + } + + /* If we reach here, we have to allocate a new entry. */ + submission_entry &entry = submission_entries.emplace_back(device); + struct tu_cs *cs = entry.try_get_cs(fence); + assert(cs); /* We just allocated it, so it should be available. */ + return cs; } -static void -free_history(struct tu_device *dev, struct tu_renderpass_history *history) +/** RP Entry Management **/ + +/* The part of the per-RP entry which is written by the GPU. */ +struct PACKED tu_autotune::rp_gpu_data { + /* HW requires the sample start/stop locations to be 128b aligned. */ + alignas(16) uint64_t samples_start; + alignas(16) uint64_t samples_end; + uint64_t ts_start; + uint64_t ts_end; +}; + +/* An "entry" of renderpass autotune results, which is used to store the results of a renderpass autotune run for a + * given command buffer. */ +struct tu_autotune::rp_entry { + private: + struct tu_device *device; + + struct tu_suballoc_bo bo; + uint8_t *map; /* A direct pointer to the BO's CPU mapping. */ + + static_assert(alignof(rp_gpu_data) == 16); + static_assert(offsetof(rp_gpu_data, samples_start) == 0); + static_assert(offsetof(rp_gpu_data, samples_end) == 16); + + public: + rp_history *history; /* Guaranteed to never be nullptr. */ + config_t config; /* Configuration at the time of entry creation. */ + bool sysmem; + uint32_t draw_count; + + rp_entry(struct tu_device *device, rp_history &history, config_t config, uint32_t draw_count) + : device(device), map(nullptr), history(&history), config(config), draw_count(draw_count) + { + } + + ~rp_entry() + { + if (map) { + std::scoped_lock lock(device->autotune->suballoc_mutex); + tu_suballoc_bo_free(&device->autotune->suballoc, &bo); + } + } + + /* Disable the copy operators as that shouldn't be done. 
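+    * Move construction/assignment stay available so rp_entry can be stored in rp_entry_batch::entries;
+    * the moved-from entry clears its map pointer so the suballocated BO is freed exactly once.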
*/ + rp_entry(const rp_entry &) = delete; + rp_entry &operator=(const rp_entry &) = delete; + + rp_entry(rp_entry &&other) noexcept + : device(other.device), bo(other.bo), map(other.map), history(other.history), config(other.config), + sysmem(other.sysmem), draw_count(other.draw_count) + { + other.map = nullptr; /* Prevent the destructor from freeing the BO. */ + } + + rp_entry &operator=(rp_entry &&other) noexcept + { + if (this != &other) { + device = other.device; + bo = other.bo; + map = other.map; + history = other.history; + config = other.config; + sysmem = other.sysmem; + draw_count = other.draw_count; + + other.map = nullptr; + } + return *this; + } + + void allocate(bool sysmem) + { + this->sysmem = sysmem; + size_t total_size = sizeof(rp_gpu_data); + + std::scoped_lock lock(device->autotune->suballoc_mutex); + VkResult result = tu_suballoc_bo_alloc(&bo, &device->autotune->suballoc, total_size, alignof(rp_gpu_data)); + if (result != VK_SUCCESS) { + mesa_loge("Failed to allocate BO for autotune rp_entry: %u", result); + return; + } + + map = (uint8_t *) tu_suballoc_bo_map(&bo); + memset(map, 0, total_size); + } + + rp_gpu_data &get_gpu_data() + { + assert(map); + return *(rp_gpu_data *) map; + } + + /** Samples-Passed Metric **/ + + uint64_t get_samples_passed() + { + assert(config.test(metric_flag::SAMPLES)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.samples_end - gpu.samples_start; + } + + void emit_metric_samples_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, start_iova); + + /* If the renderpass contains an occlusion query with its own ZPASS_DONE, we have to provide a fake ZPASS_DONE + * event here to logically close the previous one, preventing firmware from misbehaving due to nested events. + * This writes into the samples_end field, which will be overwritten in tu_autotune_end_renderpass. + */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = start_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + void emit_metric_samples_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova, uint64_t end_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE event here, composing a pair of these + * events that firmware handles without issue. This first event writes into the samples_end field and the + * second event overwrites it. The second event also enables the accumulation flag even when we don't use that + * result because the blob always sets it. 
+ */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, end_iova); + } + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = end_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + /** CS Emission **/ + + void emit_rp_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_start(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start)); + } + + void emit_rp_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_end(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start), + bo_iova + offsetof(rp_gpu_data, samples_end)); + } +}; + +tu_autotune::rp_entry_batch::rp_entry_batch(): active(false), fence(0), entries() { - tu_autotune_free_results_locked(dev, &history->results); - free(history); } -static bool -get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples) +void +tu_autotune::rp_entry_batch::assign_fence(uint32_t new_fence) { - bool has_history = false; + assert(!active); /* Cannot assign a fence to an active entry batch. */ + fence = new_fence; + active = true; +} - /* If the lock contantion would be found in the wild - - * we could use try_lock here. +/** Renderpass state tracking. **/ + +tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd) +{ + /* Q: Why not make the key from framebuffer + renderpass pointers? + * A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents + * of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably + * identify the same renderpass instance. 
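+    * Any difference in framebuffer dimensions/layer count, or in an attachment's dimensions, format,
+    * array layer or mip counts, therefore produces a different key (see get_hash below).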
*/ - u_rwlock_rdlock(&at->ht_lock); - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &rp_key); - if (entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (history->num_results > 0) { - *avg_samples = p_atomic_read(&history->avg_samples); - has_history = true; + + auto get_hash = [&](uint32_t *data, size_t size) { + uint32_t *ptr = data; + *ptr++ = framebuffer->width; + *ptr++ = framebuffer->height; + *ptr++ = framebuffer->layers; + + for (unsigned i = 0; i < pass->attachment_count; i++) { + *ptr++ = cmd->state.attachments[i]->view.width; + *ptr++ = cmd->state.attachments[i]->view.height; + *ptr++ = cmd->state.attachments[i]->image->vk.format; + *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; + *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; } - } - u_rwlock_rdunlock(&at->ht_lock); - return has_history; -} + return XXH64(data, size * sizeof(uint32_t), 0); + }; -static struct tu_renderpass_result * -create_history_result(struct tu_autotune *at, uint64_t rp_key) -{ - struct tu_renderpass_result *result = - (struct tu_renderpass_result *) calloc(1, sizeof(*result)); - result->rp_key = rp_key; + /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of + * cases, while only extreme cases need to allocate on the heap. + */ + size_t data_count = 3 + (pass->attachment_count * 5); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */ - return result; -} - -static void -history_add_result(struct tu_device *dev, struct tu_renderpass_history *history, - struct tu_renderpass_result *result) -{ - list_delinit(&result->node); - list_add(&result->node, &history->results); - - if (history->num_results < MAX_HISTORY_RESULTS) { - history->num_results++; + if (data_count <= STACK_MAX_DATA_COUNT) { + /* If the data is small enough, we can use the stack. */ + std::array arr; + hash = get_hash(arr.data(), data_count); } else { - /* Once above the limit, start popping old results off the - * tail of the list: - */ - struct tu_renderpass_result *old_result = - list_last_entry(&history->results, struct tu_renderpass_result, node); - mtx_lock(&dev->autotune_mutex); - free_result(dev, old_result); - mtx_unlock(&dev->autotune_mutex); - } - - /* Do calculations here to avoid locking history in tu_autotune_use_bypass */ - uint32_t total_samples = 0; - list_for_each_entry(struct tu_renderpass_result, result, - &history->results, node) { - total_samples += result->samples_passed; - } - - float avg_samples = (float)total_samples / (float)history->num_results; - p_atomic_set(&history->avg_samples, (uint32_t)avg_samples); -} - -static void -process_results(struct tu_autotune *at, uint32_t current_fence) -{ - struct tu_device *dev = at->device; - - list_for_each_entry_safe(struct tu_renderpass_result, result, - &at->pending_results, node) { - if (fence_before(current_fence, result->fence)) - break; - - struct tu_renderpass_history *history = result->history; - result->samples_passed = - result->samples->samples_end - result->samples->samples_start; - - history_add_result(dev, history, result); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - if (fence_before(current_fence, submission_data->fence)) - break; - - finish_submission_data(at, submission_data); + /* If the data is too large, we have to allocate it on the heap. 
*/ + std::vector vec(data_count); + hash = get_hash(vec.data(), vec.size()); } } -static void -queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf) -{ - bool one_time_submit = cmdbuf->usage_flags & - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; +/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing + * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). + */ +template class exponential_average { + private: + std::atomic average = std::numeric_limits::quiet_NaN(); + double alpha; - if (one_time_submit) { - /* We can just steal the list since it won't be resubmitted again */ - list_splicetail(&cmdbuf->renderpass_autotune_results, - &at->pending_results); - list_inithead(&cmdbuf->renderpass_autotune_results); - } else { - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - /* TODO: copying each result isn't nice */ - struct tu_renderpass_result *copy = - (struct tu_renderpass_result *) malloc(sizeof(*result)); - *copy = *result; - tu_bo_get_ref(copy->bo.bo); - list_addtail(©->node, &at->pending_results); + public: + explicit exponential_average(double alpha = 0.1) noexcept: alpha(alpha) + { + } + + bool empty() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current); + } + + void add(T value) noexcept + { + double v = static_cast(value); + double current = average.load(std::memory_order_relaxed); + double new_avg; + do { + new_avg = std::isnan(current) ? v : (1.0 - alpha) * current + alpha * v; + } while (!average.compare_exchange_weak(current, new_avg, std::memory_order_relaxed, std::memory_order_relaxed)); + } + + void clear() noexcept + { + average.store(std::numeric_limits::quiet_NaN(), std::memory_order_relaxed); + } + + T get() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current) ? T {} : static_cast(current); + } +}; + +/* All historical state pertaining to a uniquely identified RP. This integrates data from RP entries, accumulating + * metrics over the long-term and providing autotune algorithms using the data. + */ +struct tu_autotune::rp_history { + public: + uint64_t hash; /* The hash of the renderpass, just for debug output. */ + + rp_history(uint64_t hash): hash(hash) + { + } + + /** Bandwidth Estimation Algorithm **/ + struct bandwidth_algo { + private: + exponential_average mean_samples_passed; + + public: + void update(uint32_t samples) + { + mean_samples_passed.add(samples); } + + render_mode get_optimal_mode(rp_history &history, + const struct tu_cmd_state *cmd_state, + const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_render_pass_state *rp_state) + { + const VkExtent2D &extent = cmd_state->render_area.extent; + const uint32_t pass_pixel_count = extent.width * extent.height; + uint64_t sysmem_bandwidth = (uint64_t) pass->sysmem_bandwidth_per_pixel * pass_pixel_count; + uint64_t gmem_bandwidth = (uint64_t) pass->gmem_bandwidth_per_pixel * pass_pixel_count; + + uint64_t total_draw_call_bandwidth = 0; + uint64_t mean_samples = mean_samples_passed.get(); + if (rp_state->drawcall_count && mean_samples > 0.0) { + /* The total draw call bandwidth is estimated as the average samples (collected via tracking samples passed + * within the CS) multiplied by the drawcall bandwidth per sample, divided by the amount of draw calls. 
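+             * (Hypothetical numbers: mean_samples = 1000000, drawcall_bandwidth_per_sample_sum = 40 and
+             * drawcall_count = 10 would give 1000000 * 40 / 10 = 4000000 bytes, ~4 MB of estimated FB writes.)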
+ * + * This is a rough estimate of the bandwidth used by the draw calls in the renderpass for FB writes which + * is used to determine whether to use SYSMEM or GMEM. + */ + total_draw_call_bandwidth = + (mean_samples * rp_state->drawcall_bandwidth_per_sample_sum) / rp_state->drawcall_count; + } + + /* Drawcalls access the memory in SYSMEM rendering (ignoring CCU). */ + sysmem_bandwidth += total_draw_call_bandwidth; + + /* Drawcalls access GMEM in GMEM rendering, but we do not want to ignore them completely. The state changes + * between tiles also have an overhead. The magic numbers of 11 and 10 are randomly chosen. + */ + gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; + + bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; + render_mode mode = select_sysmem ? render_mode::SYSMEM : render_mode::GMEM; + + at_log_bandwidth_h( + "%" PRIu32 " selecting %s\n" + " mean_samples=%" PRIu64 ", draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64 + ", render_area=%" PRIu32 "x%" PRIu32 ", sysmem_bandwidth_per_pixel=%" PRIu32 + ", gmem_bandwidth_per_pixel=%" PRIu32 ", sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, + history.hash, rp_state->drawcall_count, render_mode_str(mode), mean_samples, + (float) rp_state->drawcall_bandwidth_per_sample_sum / rp_state->drawcall_count, total_draw_call_bandwidth, + extent.width, extent.height, pass->sysmem_bandwidth_per_pixel, pass->gmem_bandwidth_per_pixel, + sysmem_bandwidth, gmem_bandwidth); + + return mode; + } + } bandwidth; + + void process(rp_entry &entry, tu_autotune &at) + { + /* We use entry config to know what metrics it has, autotune config to know what algorithms are enabled. */ + config_t entry_config = entry.config; + config_t at_config = at.active_config.load(); + + if (entry_config.test(metric_flag::SAMPLES) && at_config.is_enabled(algorithm::BANDWIDTH)) + bandwidth.update(entry.get_samples_passed()); + } +}; + +tu_autotune::rp_history * +tu_autotune::find_rp_history(const rp_key &key) +{ + std::shared_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return &it->second; + + return nullptr; +} + +tu_autotune::rp_history & +tu_autotune::find_or_create_rp_history(const rp_key &key) +{ + rp_history *existing = find_rp_history(key); + if (existing) + return *existing; + + /* If we reach here, we have to create a new history. */ + std::unique_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return it->second; /* Another thread created the history while we were waiting for the lock. */ + auto history = rp_histories.emplace(std::make_pair(key, key.hash)); + return history.first->second; +} + +void +tu_autotune::process_entries() +{ + uint32_t current_fence = device->global_bo_map->autotune_fence; + + while (!active_batches.empty()) { + auto &batch = active_batches.front(); + assert(batch->active); + + if (fence_before(current_fence, batch->fence)) + break; /* Entries are allocated in sequence, next will be newer and + also fail so we can just directly break out of the loop. 
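+                     fence_before() is a wrap-safe comparison, so this check stays correct even after the
+                     32-bit fence counter wraps around.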
*/ + + for (rp_entry &entry : batch->entries) + entry.history->process(entry, *this); + + active_batches.pop_front(); + } + + if (active_batches.size() > 10) { + at_log_base("high amount of active batches: %zu, fence: %" PRIu32 " < %" PRIu32, active_batches.size(), + current_fence, active_batches.front()->fence); } } struct tu_cs * -tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count) +tu_autotune::on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count) { - /* We are single-threaded here */ - const uint32_t gpu_fence = get_autotune_fence(at); - const uint32_t new_fence = at->fence_counter++; - - process_results(at, gpu_fence); - - /* Create history entries here to minimize work and locking being - * done on renderpass end. + /* This call occurs regularly and we are single-threaded here, so we use this opportunity to process any available + * entries. It's also important that any entries are processed here because we always want to ensure that we've + * processed all entries from prior CBs before we submit any new CBs with the same RP to the GPU. */ + process_entries(); + + bool has_results = false; for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - struct tu_renderpass_history *history; - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &result->rp_key); - if (!entry) { - history = - (struct tu_renderpass_history *) calloc(1, sizeof(*history)); - history->key = result->rp_key; - list_inithead(&history->results); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_insert(at->ht, &history->key, history); - u_rwlock_wrunlock(&at->ht_lock); - } else { - history = (struct tu_renderpass_history *) entry->data; - } - - history->last_fence = new_fence; - - result->fence = new_fence; - result->history = history; + auto &batch = cmd_buffers[i]->autotune_ctx.batch; + if (!batch->entries.empty()) { + has_results = true; + break; } } + if (!has_results) + return nullptr; /* No results to process, return early. */ - struct tu_submission_data *submission_data = - create_submission_data(dev, at, new_fence); - + /* Generate a new fence and the CS for it. */ + const uint32_t new_fence = next_fence++; + auto fence_cs = get_cs_for_fence(new_fence); for (uint32_t i = 0; i < cmd_buffer_count; i++) { + /* Transfer the entries from the command buffers to the active queue. */ struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (list_is_empty(&cmdbuf->renderpass_autotune_results)) + auto &batch = cmdbuf->autotune_ctx.batch; + if (batch->entries.empty()) continue; - queue_pending_results(at, cmdbuf); + batch->assign_fence(new_fence); + if (cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { + /* If the command buffer is one-time submit, we can move the batch directly into the active batches, as it + * won't be used again. This would lead to it being deallocated as early as possible. + */ + active_batches.push_back(std::move(batch)); + } else { + active_batches.push_back(batch); + } } - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Total history entries: %u", at->ht->entries); + return fence_cs; +} - /* Cleanup old entries from history table. The assumption - * here is that application doesn't hold many old unsubmitted - * command buffers, otherwise this table may grow big. 
+tu_autotune::tu_autotune(struct tu_device *device, VkResult &result): device(device), active_config(get_env_config()) +{ + tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); + + result = VK_SUCCESS; + return; +} + +tu_autotune::~tu_autotune() +{ + if (TU_AUTOTUNE_FLUSH_AT_FINISH) { + while (!active_batches.empty()) + process_entries(); + at_log_base("finished processing all entries"); + } + + tu_bo_suballocator_finish(&suballoc); +} + +tu_autotune::cmd_buf_ctx::cmd_buf_ctx(): batch(std::make_shared()) +{ +} + +tu_autotune::cmd_buf_ctx::~cmd_buf_ctx() +{ + /* This is empty but it causes the implicit destructor to be compiled within this compilation unit with access to + * internal structures. Otherwise, we would need to expose the full definition of autotuner internals in the header + * file, which is not desirable. */ - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME)) - continue; - - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Removed old history entry %016" PRIx64 "", history->key); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_remove_key(at->ht, &history->key); - u_rwlock_wrunlock(&at->ht_lock); - - mtx_lock(&dev->autotune_mutex); - free_history(dev, history); - mtx_unlock(&dev->autotune_mutex); - } - - return &submission_data->fence_cs; -} - -static bool -renderpass_key_equals(const void *_a, const void *_b) -{ - return *(uint64_t *)_a == *(uint64_t *)_b; -} - -static uint32_t -renderpass_key_hash(const void *_a) -{ - return *((uint64_t *) _a) & 0xffffffff; -} - -VkResult -tu_autotune_init(struct tu_autotune *at, struct tu_device *dev) -{ - at->enabled = true; - at->device = dev; - at->ht = _mesa_hash_table_create(NULL, - renderpass_key_hash, - renderpass_key_equals); - u_rwlock_init(&at->ht_lock); - - list_inithead(&at->pending_results); - list_inithead(&at->pending_submission_data); - list_inithead(&at->submission_data_pool); - - /* start from 1 because tu6_global::autotune_fence is initialized to 0 */ - at->fence_counter = 1; - - return VK_SUCCESS; } void -tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) +tu_autotune::cmd_buf_ctx::reset() { - if (TU_AUTOTUNE_LOG_AT_FINISH) { - while (!list_is_empty(&at->pending_results)) { - const uint32_t gpu_fence = get_autotune_fence(at); - process_results(at, gpu_fence); - } - - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - - mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u", - history->key, history->avg_samples, history->num_results); - } - } - - tu_autotune_free_results(dev, &at->pending_results); - - mtx_lock(&dev->autotune_mutex); - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - free_history(dev, history); - } - mtx_unlock(&dev->autotune_mutex); - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - free_submission_data(submission_data); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->submission_data_pool, node) { - free_submission_data(submission_data); - } - - _mesa_hash_table_destroy(at->ht, NULL); - u_rwlock_destroy(&at->ht_lock); + batch = std::make_shared(); } -bool -tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t 
cmd_buffer_count) +tu_autotune::rp_entry & +tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, + rp_history &history, + config_t config, + uint32_t drawcall_count) { - for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) - return true; - } - - return false; + return batch->entries.emplace_back(device, history, config, drawcall_count); } -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results) +tu_autotune::render_mode +tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { - list_for_each_entry_safe(struct tu_renderpass_result, result, - results, node) { - free_result(dev, result); - } -} + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + cmd_buf_ctx &cb_ctx = cmd_buffer->autotune_ctx; + config_t config = active_config.load(); -void -tu_autotune_free_results(struct tu_device *dev, struct list_head *results) -{ - mtx_lock(&dev->autotune_mutex); - tu_autotune_free_results_locked(dev, results); - mtx_unlock(&dev->autotune_mutex); -} - -static bool -fallback_use_bypass(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd_buffer) -{ - if (cmd_buffer->state.rp.drawcall_count > 5) - return false; - - for (unsigned i = 0; i < pass->subpass_count; i++) { - if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT) - return false; - } - - return true; -} - -static uint32_t -get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd) -{ - const VkExtent2D *extent = &cmd->state.render_area.extent; - return extent->width * extent->height; -} - -static uint64_t -estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd, - uint32_t avg_renderpass_sample_count) -{ - const struct tu_cmd_state *state = &cmd->state; - - if (!state->rp.drawcall_count) - return 0; - - /* sample count times drawcall_bandwidth_per_sample */ - return (uint64_t)avg_renderpass_sample_count * - state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count; -} - -bool -tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result) -{ - const struct tu_render_pass *pass = cmd_buffer->state.pass; - const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; + /* Just to ensure a segfault for accesses, in case we don't set it. */ + *rp_ctx = nullptr; /* If a feedback loop in the subpass caused one of the pipelines used to set - * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even - * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased - * sysmem bandwidth (though we haven't quantified it). + * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even SINGLE_PRIM_MODE(FLUSH), then that should cause + * significantly increased SYSMEM bandwidth (though we haven't quantified it). */ - if (cmd_buffer->state.rp.sysmem_single_prim_mode) - return false; + if (rp_state->sysmem_single_prim_mode) + return render_mode::GMEM; - /* If the user is using a fragment density map, then this will cause less - * FS invocations with GMEM, which has a hard-to-measure impact on - * performance because it depends on how heavy the FS is in addition to how - * many invocations there were and the density. 
Let's assume the user knows - * what they're doing when they added the map, because if sysmem is - * actually faster then they could've just not used the fragment density - * map. + /* If the user is using a fragment density map, then this will cause less FS invocations with GMEM, which has a + * hard-to-measure impact on performance because it depends on how heavy the FS is in addition to how many + * invocations there were and the density. Let's assume the user knows what they're doing when they added the map, + * because if SYSMEM is actually faster then they could've just not used the fragment density map. */ if (pass->has_fdm) - return false; + return render_mode::GMEM; - /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers - * we would have to allocate GPU memory at the submit time and copy - * results into it. - * Native games ususally don't use it, Zink and DXVK don't use it, - * D3D12 doesn't have such concept. + /* SYSMEM is always a safe default mode when we can't fully engage the autotuner. From testing, we know that for an + * incorrect decision towards SYSMEM tends to be far less impactful than an incorrect decision towards GMEM, which + * can cause significant performance issues. */ - bool simultaneous_use = - cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + constexpr render_mode default_mode = render_mode::SYSMEM; - if (!at->enabled || simultaneous_use) - return fallback_use_bypass(pass, framebuffer, cmd_buffer); - - /* We use 64bit hash as a key since we don't fear rare hash collision, - * the worst that would happen is sysmem being selected when it should - * have not, and with 64bit it would be extremely rare. + /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers, we would have to allocate GPU memory at the submit time + * and copy results into it. We just disable complex autotuner in this case, which isn't a big issue since native + * games usually don't use it, Zink and DXVK don't use it, while D3D12 doesn't even have such concept. * - * Q: Why not make the key from framebuffer + renderpass pointers? - * A: At least DXVK creates new framebuffers each frame while keeping - * renderpasses the same. Also we want to support replaying a single - * frame in a loop for testing. + * We combine this with processing entries at submit time, to avoid a race where the CPU hasn't processed the results + * from an earlier submission of the CB while a second submission of the CB is on the GPU queue. 
*/ - uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer); + bool simultaneous_use = cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - *autotune_result = create_history_result(at, renderpass_key); + if (!enabled || simultaneous_use) + return default_mode; - uint32_t avg_samples = 0; - if (get_history(at, renderpass_key, &avg_samples)) { - const uint32_t pass_pixel_count = - get_render_pass_pixel_count(cmd_buffer); - uint64_t sysmem_bandwidth = - (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count; - uint64_t gmem_bandwidth = - (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count; + if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) + return render_mode::GMEM; + if (config.test(mod_flag::SMALL_SYSMEM) && rp_state->drawcall_count <= 5) + return render_mode::SYSMEM; - const uint64_t total_draw_call_bandwidth = - estimate_drawcall_bandwidth(cmd_buffer, avg_samples); + rp_key key(pass, framebuffer, cmd_buffer); + auto &history = find_or_create_rp_history(key); + *rp_ctx = &cb_ctx.attach_rp_entry(device, history, config, rp_state->drawcall_count); - /* drawcalls access the memory in sysmem rendering (ignoring CCU) */ - sysmem_bandwidth += total_draw_call_bandwidth; + if (config.is_enabled(algorithm::BANDWIDTH)) + return history.bandwidth.get_optimal_mode(history, cmd_state, pass, framebuffer, rp_state); - /* drawcalls access gmem in gmem rendering, but we do not want to ignore - * them completely. The state changes between tiles also have an - * overhead. The magic numbers of 11 and 10 are randomly chosen. - */ - gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; - - const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; - if (TU_AUTOTUNE_DEBUG_LOG) { - const VkExtent2D *extent = &cmd_buffer->state.render_area.extent; - const float drawcall_bandwidth_per_sample = - (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum / - cmd_buffer->state.rp.drawcall_count; - - mesa_logi("autotune %016" PRIx64 ":%u selecting %s", - renderpass_key, - cmd_buffer->state.rp.drawcall_count, - select_sysmem ? 
"sysmem" : "gmem"); - mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64, - avg_samples, - drawcall_bandwidth_per_sample, - total_draw_call_bandwidth); - mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u", - extent->width, extent->height, - pass->sysmem_bandwidth_per_pixel, - pass->gmem_bandwidth_per_pixel); - mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, - sysmem_bandwidth, gmem_bandwidth); - } - - return select_sysmem; - } - - return fallback_use_bypass(pass, framebuffer, cmd_buffer); + return default_mode; } -template +/** RP-level CS emissions **/ + void -tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_autotune::begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem) { - if (!autotune_result) + if (!rp_ctx) return; - struct tu_device *dev = cmd->device; - - static const uint32_t size = sizeof(struct tu_renderpass_samples); - - mtx_lock(&dev->autotune_mutex); - VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size); - mtx_unlock(&dev->autotune_mutex); - if (ret != VK_SUCCESS) { - autotune_result->bo.iova = 0; - return; - } - - uint64_t result_iova = autotune_result->bo.iova; - - autotune_result->samples = - (struct tu_renderpass_samples *) tu_suballoc_bo_map( - &autotune_result->bo); - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova); - - /* If the renderpass contains an occlusion query with its own ZPASS_DONE, - * we have to provide a fake ZPASS_DONE event here to logically close the - * previous one, preventing firmware from misbehaving due to nested events. - * This writes into the samples_end field, which will be overwritten in - * tu_autotune_end_renderpass. - */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } - } else { - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + rp_ctx->allocate(sysmem); + rp_ctx->emit_rp_start(cmd, cs); } -TU_GENX(tu_autotune_begin_renderpass); -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +void +tu_autotune::end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx) { - if (!autotune_result) + if (!rp_ctx) return; - if (!autotune_result->bo.iova) - return; - - uint64_t result_iova = autotune_result->bo.iova; - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE - * event here, composing a pair of these events that firmware handles without - * issue. This first event writes into the samples_end field and the second - * event overwrites it. 
The second event also enables the accumulation flag - * even when we don't use that result because the blob always sets it. - */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end)); - } - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } else { - result_iova += offsetof(struct tu_renderpass_samples, samples_end); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + rp_ctx->emit_rp_end(cmd, cs); } -TU_GENX(tu_autotune_end_renderpass); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index c374e86ab89..f5cea4c8c3c 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -8,150 +8,230 @@ #include "tu_common.h" -#include "util/hash_table.h" -#include "util/rwlock.h" +#include +#include +#include +#include +#include +#include +#include +#include "tu_cs.h" #include "tu_suballoc.h" -struct tu_renderpass_history; - -/** - * "autotune" our decisions about bypass vs GMEM rendering, based on historical - * data about a given render target. - * - * In deciding which path to take there are tradeoffs, including some that - * are not reasonably estimateable without having some additional information: - * - * (1) If you know you are touching every pixel (ie. there is a clear), - * then the GMEM path will at least not cost more memory bandwidth than - * sysmem[1] - * - * (2) If there is no clear, GMEM could potentially cost *more* bandwidth - * if there is sysmem->GMEM restore pass. - * - * (3) If you see a high draw count, that is an indication that there will be - * enough pixels accessed multiple times to benefit from the reduced - * memory bandwidth that GMEM brings - * - * (4) But high draw count where there is not much overdraw can actually be - * faster in bypass mode if it is pushing a lot of state change, due to - * not having to go thru the state changes per-tile[1] - * - * The approach taken is to measure the samples-passed for the batch to estimate - * the amount of overdraw to detect cases where the number of pixels touched is - * low. - * - * [1] ignoring early-tile-exit optimizations, but any draw that touches all/ - * most of the tiles late in the tile-pass can defeat that +/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on + * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static + * analysis, since we can adapt to the actual workload rather than relying on heuristics. */ struct tu_autotune { - - /* We may have to disable autotuner if there are too many - * renderpasses in-flight. - */ - bool enabled; - + private: + bool enabled = true; struct tu_device *device; - /** - * Cache to map renderpass key to historical information about - * rendering to that particular render target. 
- */ - struct hash_table *ht; - struct u_rwlock ht_lock; + /** Configuration **/ - /** - * List of per-renderpass results that we are waiting for the GPU - * to finish with before reading back the results. - */ - struct list_head pending_results; + enum class algorithm : uint8_t; + enum class mod_flag : uint8_t; + enum class metric_flag : uint8_t; + /* Container for all autotune configuration options. */ + struct PACKED config_t; + union PACKED packed_config_t; - /** - * List of per-submission data that we may want to free after we - * processed submission results. - * This could happend after command buffers which were in the submission - * are destroyed. - */ - struct list_head pending_submission_data; + /* Allows for thread-safe access to the configurations. */ + struct atomic_config_t { + private: + std::atomic config_bits = 0; - /** - * List of per-submission data that has been finished and can be reused. - */ - struct list_head submission_data_pool; + public: + atomic_config_t(config_t initial_config); - uint32_t fence_counter; - uint32_t idx_counter; + config_t load() const; + + bool compare_and_store(config_t updated, config_t expected); + } active_config; + + config_t get_env_config(); + + /** Global Fence and Internal CS Management **/ + + /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers. + * Synchronized by suballoc_mutex. + */ + struct tu_suballocator suballoc; + std::mutex suballoc_mutex; + + /* The next value to assign to tu6_global::autotune_fence, this is incremented during on_submit. */ + uint32_t next_fence = 1; + + /* A wrapper around a CS which sets the global autotune fence to a certain fence value, this allows for ergonomically + * managing the lifetime of the CS including recycling it after the fence value has been reached. + */ + struct submission_entry { + private: + uint32_t fence; + struct tu_cs fence_cs; + + public: + explicit submission_entry(tu_device *device); + + ~submission_entry(); + + /* Disable move/copy, since this holds stable pointers to the fence_cs. */ + submission_entry(const submission_entry &) = delete; + submission_entry &operator=(const submission_entry &) = delete; + submission_entry(submission_entry &&) = delete; + submission_entry &operator=(submission_entry &&) = delete; + + /* The current state of the submission entry, this is used to track whether the CS is available for reuse, pending + * GPU completion or currently being processed. + */ + bool is_active() const; + + /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */ + struct tu_cs *try_get_cs(uint32_t new_fence); + }; + + /* Unified pool for submission CSes. + * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry. + */ + std::deque submission_entries; + + /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */ + struct tu_cs *get_cs_for_fence(uint32_t fence); + + /** RP Entry Management **/ + + struct rp_gpu_data; + struct tile_gpu_data; + struct rp_entry; + + /* A wrapper over all entries associated with a single command buffer. */ + struct rp_entry_batch { + bool active; /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a + valid fence. */ + uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to + determine when the entries can be processed. 
*/ + std::vector entries; + + rp_entry_batch(); + + /* Disable the copy/move to avoid performance hazards. */ + rp_entry_batch(const rp_entry_batch &) = delete; + rp_entry_batch &operator=(const rp_entry_batch &) = delete; + rp_entry_batch(rp_entry_batch &&) = delete; + rp_entry_batch &operator=(rp_entry_batch &&) = delete; + + void assign_fence(uint32_t new_fence); + }; + + /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient + * iteration and to ensure that we process the entries in the same order they were submitted. + */ + std::deque> active_batches; + + /* Handles processing of entry batches that are pending to be processed. + * + * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this + * in the on_submit() method, which is called on every submit of a command buffer. + */ + void process_entries(); + + /** Renderpass State Tracking **/ + + struct rp_history; + + /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to + * be stable across runs, so it can be used to identify the same renderpass instance consistently. + * + * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into + * rp_key, which would lead to true value-based equality rather than just hash-based equality which has a cost + * but avoids hash collisions causing issues. + */ + struct rp_key { + uint64_t hash; + + rp_key(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd); + + /* Equality operator, used in unordered_map. */ + constexpr bool operator==(const rp_key &other) const noexcept + { + return hash == other.hash; + } + }; + + /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key. + * + * Note: This should *NEVER* be used to calculate the hash itself as it would lead to the hash being calculated + * multiple times, rather than being calculated once and reused when there's multiple successive lookups like + * with find_or_create_rp_history() and providing the hash to the rp_history constructor. + */ + struct rp_hash { + constexpr size_t operator()(const rp_key &key) const noexcept + { + /* Note: This will throw away the upper 32-bits on 32-bit architectures. */ + return static_cast(key.hash); + } + }; + + /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */ + std::unordered_map rp_histories; + std::shared_mutex rp_mutex; + + /* Note: These will internally lock rp_mutex internally, no need to lock it. */ + rp_history *find_rp_history(const rp_key &key); + rp_history &find_or_create_rp_history(const rp_key &key); + + public: + tu_autotune(struct tu_device *device, VkResult &result); + + ~tu_autotune(); + + /* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */ + using rp_ctx_t = rp_entry *; + + /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB. + * + * Note: tu_cmd_buffer is only responsible for the lifetime of this object, all the access to the context state is + * done through tu_autotune. + */ + struct cmd_buf_ctx { + private: + /* A batch of all entries from RPs within this CB. */ + std::shared_ptr batch; + + /* Creates a new RP entry attached to this CB. 
*/ + rp_entry &attach_rp_entry(struct tu_device *device, rp_history &entry, config_t config, uint32_t draw_count); + + friend struct tu_autotune; + + public: + cmd_buf_ctx(); + ~cmd_buf_ctx(); + + /* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */ + void reset(); + }; + + enum class render_mode { + SYSMEM, + GMEM, + }; + + render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx); + + void begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem); + + void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx); + + /* The submit-time hook for autotuner, this may return a CS (can be NULL) which must be amended for autotuner + * tracking to function correctly. + * + * Note: This must be called from a single-threaded context. There should never be multiple threads calling this + * function at the same time. + */ + struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count); }; -/** - * From the cmdstream, the captured samples-passed values are recorded - * at the start and end of the batch. - * - * Note that we do the math on the CPU to avoid a WFI. But pre-emption - * may force us to revisit that. - */ -struct PACKED tu_renderpass_samples { - uint64_t samples_start; - /* hw requires the sample start/stop locations to be 128b aligned. */ - uint64_t __pad0; - uint64_t samples_end; - uint64_t __pad1; -}; - -/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */ -static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16); - -/** - * Tracks the results from an individual renderpass. Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. - */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev); -void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev); - -bool tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result); -void tu_autotune_free_results(struct tu_device *dev, struct list_head *results); - -bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -/** - * A magic 8-ball that tells the gmem code whether we should do bypass mode - * for moar fps. 
- */ -struct tu_cs *tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -struct tu_autotune_results_buffer; - -template -void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -#endif /* TU_AUTOTUNE_H */ +#endif /* TU_AUTOTUNE_H */ \ No newline at end of file diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index df892beaa91..eeee54e2e34 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -14,6 +14,7 @@ #include "vk_render_pass.h" #include "vk_util.h" +#include "tu_autotune.h" #include "tu_buffer.h" #include "tu_clear_blit.h" #include "tu_cs.h" @@ -1277,7 +1278,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) static bool use_sysmem_rendering(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result **autotune_result) + tu_autotune::rp_ctx_t *rp_ctx) { if (TU_DEBUG(SYSMEM)) { cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)"; @@ -1330,15 +1331,9 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, if (TU_DEBUG(GMEM)) return false; - bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune, - cmd, autotune_result); - if (*autotune_result) { - list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results); - } - - if (use_sysmem) { + bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM; + if (use_sysmem) cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem"; - } return use_sysmem; } @@ -3021,7 +3016,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd, template static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { const struct tu_framebuffer *fb = cmd->state.framebuffer; @@ -3075,7 +3070,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP)); } - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true); tu_cs_sanity_check(cs); } @@ -3083,7 +3078,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { /* Do any resolves of the last subpass. These are handled in the * tile_store_cs in the gmem path. 
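For orientation, the reworked interface is driven from the command-buffer and queue code roughly as sketched below; this is a condensed, illustrative summary of the hunks in this patch (CHIP templating, error handling and the surrounding functions are omitted), not literal driver code:

   tu_autotune::rp_ctx_t rp_ctx = NULL;
   bool sysmem = cmd->device->autotune->get_optimal_mode(cmd, &rp_ctx) ==
                 tu_autotune::render_mode::SYSMEM;

   /* Emitted at render pass begin/end, in both the sysmem and gmem paths. */
   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, sysmem);
   /* ... draw IBs ... */
   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);

   /* At queue submit time the autotuner may hand back a fence-writing CS. */
   struct tu_cs *autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
   if (autotune_cs)
      submit_add_entries(device, submit, &dump_cmds,
                         autotune_cs->entries, autotune_cs->entry_count);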
@@ -3111,7 +3106,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit(cs, 0); /* value */ } - tu_autotune_end_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); tu_cs_sanity_check(cs); } @@ -3261,7 +3256,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { struct tu_physical_device *phys_dev = cmd->device->physical_device; @@ -3448,7 +3443,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (use_cb) tu_trace_start_render_pass(cmd); - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false); tu_cs_sanity_check(cs); } @@ -3514,7 +3509,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); @@ -3544,7 +3539,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE); - tu_autotune_end_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); tu_cs_sanity_check(cs); } @@ -3878,7 +3873,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { const struct tu_tiling_config *tiling = cmd->state.tiling; @@ -3912,7 +3907,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_emit_tile_store_cs(cmd, &cmd->tile_store_cs); tu_cs_end(&cmd->tile_store_cs); - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets); + tu6_tile_render_begin(cmd, &cmd->cs, rp_ctx, fdm_offsets); /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. 
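As an aside, the serpentine walk described in the comment above can be illustrated with a minimal self-contained sketch; the grid size and the printf stand-in are made up for illustration and are not driver code:

   #include <cstdio>

   int main()
   {
      const unsigned tile_count_x = 4, tile_count_y = 3; /* example grid */

      for (unsigned ty = 0; ty < tile_count_y; ty++) {
         /* Reverse the X direction on every other row so consecutive tiles
          * stay neighbours, which is friendlier to the texture cache than
          * plain raster order.
          */
         bool reverse = (ty & 1) != 0;
         for (unsigned i = 0; i < tile_count_x; i++) {
            unsigned tx = reverse ? tile_count_x - 1 - i : i;
            printf("tile (%u, %u)\n", tx, ty); /* stand-in for the per-tile emit */
         }
      }
      return 0;
   }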
@@ -3964,7 +3959,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, } } - tu6_tile_render_end(cmd, &cmd->cs, autotune_result); + tu6_tile_render_end(cmd, &cmd->cs, rp_ctx); tu_trace_end_render_pass(cmd, true); @@ -3984,7 +3979,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, template static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { VkResult result = tu_allocate_transient_attachments(cmd, true); @@ -3995,7 +3990,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, tu_trace_start_render_pass(cmd); - tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_begin(cmd, &cmd->cs, rp_ctx); trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd); @@ -4003,7 +3998,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); - tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_end(cmd, &cmd->cs, rp_ctx); tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace, cmd->trace_renderpass_start, @@ -4020,11 +4015,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer, if (cmd_buffer->state.rp.has_tess) tu6_lazy_emit_tessfactor_addr(cmd_buffer); - struct tu_renderpass_result *autotune_result = NULL; - if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); + tu_autotune::rp_ctx_t rp_ctx = NULL; + if (use_sysmem_rendering(cmd_buffer, &rp_ctx)) + tu_cmd_render_sysmem(cmd_buffer, rp_ctx); else - tu_cmd_render_tiles(cmd_buffer, autotune_result, fdm_offsets); + tu_cmd_render_tiles(cmd_buffer, rp_ctx, fdm_offsets); /* Outside of renderpasses we assume all draw states are disabled. We do * this outside the draw CS for the normal case where 3d gmem stores aren't @@ -4097,7 +4092,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool, u_trace_init(&cmd_buffer->rp_trace, &device->trace_context); cmd_buffer->trace_renderpass_start = u_trace_begin_iterator(&cmd_buffer->rp_trace); - list_inithead(&cmd_buffer->renderpass_autotune_results); + new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx(); if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) { cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device); @@ -4146,7 +4141,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) u_trace_fini(&cmd_buffer->trace); u_trace_fini(&cmd_buffer->rp_trace); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.~cmd_buf_ctx(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { if (cmd_buffer->descriptors[i].push_set.layout) @@ -4224,7 +4219,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, tu_cs_reset(&cmd_buffer->pre_chain.draw_cs); tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.reset(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 4e974e12827..cd853826207 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -644,8 +644,7 @@ struct tu_cmd_buffer struct u_trace_iterator trace_renderpass_start; struct u_trace trace, rp_trace; - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; + tu_autotune::cmd_buf_ctx autotune_ctx; void 
*patchpoints_ctx; struct util_dynarray fdm_bin_patchpoints; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index dceb5227116..e9f31fb67d5 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2633,7 +2633,6 @@ tu_device_destroy_mutexes(struct tu_device *device) { mtx_destroy(&device->bo_mutex); mtx_destroy(&device->pipeline_mutex); - mtx_destroy(&device->autotune_mutex); mtx_destroy(&device->kgsl_profiling_mutex); mtx_destroy(&device->event_mutex); mtx_destroy(&device->trace_mutex); @@ -2743,7 +2742,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->bo_mutex, mtx_plain); mtx_init(&device->pipeline_mutex, mtx_plain); - mtx_init(&device->autotune_mutex, mtx_plain); mtx_init(&device->kgsl_profiling_mutex, mtx_plain); mtx_init(&device->event_mutex, mtx_plain); mtx_init(&device->trace_mutex, mtx_plain); @@ -2868,9 +2866,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_INTERNAL_RESOURCE), "pipeline_suballoc"); - tu_bo_suballocator_init(&device->autotune_suballoc, device, - 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, - "autotune_suballoc"); if (is_kgsl(physical_device->instance)) { tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, @@ -3019,10 +3014,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } pthread_condattr_destroy(&condattr); - result = tu_autotune_init(&device->autotune, device); - if (result != VK_SUCCESS) { + device->autotune = new tu_autotune(device, result); + if (result != VK_SUCCESS) goto fail_timeline_cond; - } device->use_z24uint_s8uint = physical_device->info->props.has_z24uint_s8uint && @@ -3180,10 +3174,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) free(device->dbg_renderpass_stomp_cs); } - tu_autotune_fini(&device->autotune, device); + delete device->autotune; tu_bo_suballocator_finish(&device->pipeline_suballoc); - tu_bo_suballocator_finish(&device->autotune_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); tu_bo_suballocator_finish(&device->vis_stream_suballocator); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 08c102ae145..049d5bd581b 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -28,6 +28,7 @@ #include "common/freedreno_rd_output.h" #include "util/vma.h" #include "util/u_vector.h" +#include "util/rwlock.h" /* queue types */ #define TU_QUEUE_GENERAL 0 @@ -265,7 +266,12 @@ struct tu6_global volatile uint32_t vtx_stats_query_not_running; - /* To know when renderpass stats for autotune are valid */ + /* A fence with a monotonically increasing value that is + * incremented by the GPU on each submission that includes + * a tu_autotune::submission_entry CS. This is used to track + * which submissions have been processed by the GPU before + * processing the autotune packet on the CPU. + */ volatile uint32_t autotune_fence; /* For recycling command buffers for dynamic suspend/resume comamnds */ @@ -355,12 +361,6 @@ struct tu_device struct tu_suballocator pipeline_suballoc; mtx_t pipeline_mutex; - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. 
- */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on * each submission. */ @@ -462,7 +462,7 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; - struct tu_autotune autotune; + struct tu_autotune *autotune; struct breadcrumbs_context *breadcrumbs_ctx; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index a87a73f0cd4..d6acf399042 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) struct tu_device *device = queue->device; bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); struct util_dynarray dump_cmds; + struct tu_cs *autotune_cs = NULL; if (vk_submit->buffer_bind_count || vk_submit->image_bind_count || @@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) } } - if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { - struct tu_cs *autotune_cs = tu_autotune_on_submit( - device, &device->autotune, cmd_buffers, cmdbuf_count); + autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count); + if (autotune_cs) { submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries, autotune_cs->entry_count); } From ff4bb3c658ccf770d101fe7aa319d5df87c3c9fa Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Mon, 1 Sep 2025 17:45:18 +0200 Subject: [PATCH 04/25] tu/autotune: Improve RP hash Makes RP hash more unique by using attachment IOVAs and considering multiple instances of the same RP within a CB. Signed-off-by: Danylo Piliaiev --- src/freedreno/vulkan/tu_autotune.cc | 71 ++++++++++++++++++++++++----- src/freedreno/vulkan/tu_autotune.h | 5 ++ 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index f7de8603a29..6ee442fbf46 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -378,6 +378,9 @@ struct tu_autotune::rp_entry { bool sysmem; uint32_t draw_count; + /* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ + uint32_t duplicates = 0; + rp_entry(struct tu_device *device, rp_history &history, config_t config, uint32_t draw_count) : device(device), map(nullptr), history(&history), config(config), draw_count(draw_count) { @@ -540,12 +543,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd) { - /* Q: Why not make the key from framebuffer + renderpass pointers? - * A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents - * of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably - * identify the same renderpass instance. + /* It may be hard to match the same renderpass between frames, or rather it's hard to strike a + * balance between being too lax with identifying different renderpasses as the same one, and + * not recognizing the same renderpass between frames when only a small thing changed. + * + * This is mainly an issue with translation layers (particularly DXVK), because a layer may + * break a "renderpass" into smaller ones due to some heuristic that isn't consistent between + * frames. + * + * Note: Not using image IOVA leads to too many false matches. 
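+ *
+ * For reference, the data that feeds the hash below is the framebuffer
+ * width/height/layer count plus, for each attachment, the image IOVA
+ * (including the view offset) and the load/store and stencil load/store
+ * flags, all combined with XXH64.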
*/ + struct packed_att_properties { + uint64_t iova; + bool load : 1; + bool store : 1; + bool load_stencil : 1; + bool store_stencil : 1; + }; + auto get_hash = [&](uint32_t *data, size_t size) { uint32_t *ptr = data; *ptr++ = framebuffer->width; @@ -553,12 +569,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, *ptr++ = framebuffer->layers; for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + packed_att_properties props = { + .iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset, + .load = pass->attachments[i].load, + .store = pass->attachments[i].store, + .load_stencil = pass->attachments[i].load_stencil, + .store_stencil = pass->attachments[i].store_stencil, + }; + + memcpy(ptr, &props, sizeof(packed_att_properties)); + ptr += sizeof(packed_att_properties) / sizeof(uint32_t); } + assert(ptr == data + size); return XXH64(data, size * sizeof(uint32_t), 0); }; @@ -566,8 +588,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of * cases, while only extreme cases need to allocate on the heap. */ - size_t data_count = 3 + (pass->attachment_count * 5); - constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */ + size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t)); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */ if (data_count <= STACK_MAX_DATA_COUNT) { /* If the data is small enough, we can use the stack. */ @@ -580,6 +602,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, } } +tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates) +{ + hash = XXH64(&key.hash, sizeof(key.hash), duplicates); +} + /* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). */ @@ -627,6 +654,7 @@ template class exponential_average { struct tu_autotune::rp_history { public: uint64_t hash; /* The hash of the renderpass, just for debug output. */ + uint32_t duplicates; rp_history(uint64_t hash): hash(hash) { @@ -846,6 +874,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, return batch->entries.emplace_back(device, history, config, drawcall_count); } +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key) +{ + for (auto &entry : batch->entries) { + if (entry.history->hash == key.hash) + return &entry; + } + return nullptr; +} + tu_autotune::render_mode tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { @@ -898,6 +936,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx return render_mode::SYSMEM; rp_key key(pass, framebuffer, cmd_buffer); + + /* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a + * unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses + * across different command buffers, it is good enough in most cases. 
+ */ + rp_entry *entry = cb_ctx.find_rp_entry(key); + if (entry) { + entry->duplicates++; + key = rp_key(key, entry->duplicates); + } + auto &history = find_or_create_rp_history(key); *rp_ctx = &cb_ctx.attach_rp_entry(device, history, config, rp_state->drawcall_count); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index f5cea4c8c3c..3b47508a2bb 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -154,6 +154,9 @@ struct tu_autotune { const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd); + /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */ + rp_key(const rp_key &key, uint32_t duplicates); + /* Equality operator, used in unordered_map. */ constexpr bool operator==(const rp_key &other) const noexcept { @@ -204,6 +207,8 @@ struct tu_autotune { /* Creates a new RP entry attached to this CB. */ rp_entry &attach_rp_entry(struct tu_device *device, rp_history &entry, config_t config, uint32_t draw_count); + rp_entry *find_rp_entry(const rp_key &key); + friend struct tu_autotune; public: From f4e6981a4c0b3d988a7621edf6c2951bc409f8df Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:10 +0000 Subject: [PATCH 05/25] util/rand_xor: Add extern C for C++ compatibility Signed-off-by: Dhruv Mark Collins --- src/util/rand_xor.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/util/rand_xor.h b/src/util/rand_xor.h index b55598f228a..830c6c3e727 100644 --- a/src/util/rand_xor.h +++ b/src/util/rand_xor.h @@ -28,10 +28,18 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + uint64_t rand_xorshift128plus(uint64_t seed[2]); void s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed); +#ifdef __cplusplus +} /* end of extern "C" */ +#endif + #endif /* RAND_XOR_H */ From 3248d47b48331076e8d11a62788726a0f1c9b10f Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 20:20:41 +0000 Subject: [PATCH 06/25] tu/autotune: Add "Profiled" algorithm This algo measures the time taken by each RP as a whole, and uses that to move a probability distribution of whether to use GMEM or SYSMEM for that RP. This is done with a delta of 5% per run, and the probability is clamped to 5% and 95% to avoid getting stuck when conditions change. Additionally, an "immediate resolve" variant which tries to work off a single data point in SYSMEM and GMEM, then immediately resolves to the faster path. This is useful for usage in CI which runs a single frame multiple times where the performance isn't varying change from frame to frame. Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 13 ++ src/freedreno/vulkan/tu_autotune.cc | 201 +++++++++++++++++++++++++++- 2 files changed, 213 insertions(+), 1 deletion(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index cfdcaa21941..163182c5490 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -681,6 +681,19 @@ environment variables: Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses the one with lower estimated bandwidth. This is the default algorithm. + ``profiled`` + Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to + move a probability distribution towards the optimal choice over time. 
This + algorithm tends to be far more accurate than the bandwidth algorithm at choosing + the optimal rendering mode but may result in larger FPS variance due to being + based on a probability distribution with random sampling. + + ``profiled_imm`` + Similar to ``profiled``, but only profiles the first few instances of a RP + and then sticks to the chosen mode for subsequent instances. This is meant + for single-frame traces run multiple times in a CI where this algorithm can + immediately chose the optimal rendering mode for each RP. + .. envvar:: TU_AUTOTUNE_FLAGS Modifies the behavior of the selected algorithm. Supported flags are: diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 6ee442fbf46..f1d0fab5e6a 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -26,6 +26,7 @@ #define TU_AUTOTUNE_DEBUG_LOG_BASE 0 #define TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH 0 +#define TU_AUTOTUNE_DEBUG_LOG_PROFILED 0 #if TU_AUTOTUNE_DEBUG_LOG_BASE #define at_log_base(fmt, ...) mesa_logi("autotune: " fmt, ##__VA_ARGS__) @@ -41,6 +42,12 @@ #define at_log_bandwidth_h(fmt, hash, ...) #endif +#if TU_AUTOTUNE_DEBUG_LOG_PROFILED +#define at_log_profiled_h(fmt, hash, ...) mesa_logi("autotune-prof %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_profiled_h(fmt, hash, ...) +#endif + /* Process any pending entries on autotuner finish, could be used to gather data from traces. */ #define TU_AUTOTUNE_FLUSH_AT_FINISH 0 @@ -80,6 +87,8 @@ render_mode_str(tu_autotune::render_mode mode) enum class tu_autotune::algorithm : uint8_t { BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ + PROFILED = 1, /* Uses dynamically profiled results for determining rendering mode. */ + PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ DEFAULT = BANDWIDTH, /* Default algorithm, used if no other is specified. */ }; @@ -93,6 +102,7 @@ enum class tu_autotune::mod_flag : uint8_t { /* Metric flags, for internal tracking of enabled metrics. */ enum class tu_autotune::metric_flag : uint8_t { SAMPLES = BIT(1), /* Enable tracking samples passed metric. */ + TS = BIT(2), /* Enable tracking per-RP timestamp metric. */ }; struct PACKED tu_autotune::config_t { @@ -106,6 +116,8 @@ struct PACKED tu_autotune::config_t { /* Note: Always keep in sync with rp_history to prevent UB. 
*/ if (algo == algorithm::BANDWIDTH) { metric_flags |= (uint8_t) metric_flag::SAMPLES; + } else if (algo == algorithm::PROFILED || algo == algorithm::PROFILED_IMM) { + metric_flags |= (uint8_t) metric_flag::TS; } } @@ -179,6 +191,8 @@ struct PACKED tu_autotune::config_t { std::string str = "Algorithm: "; ALGO_STR(BANDWIDTH); + ALGO_STR(PROFILED); + ALGO_STR(PROFILED_IMM); str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; MODF_STR(BIG_GMEM); @@ -187,6 +201,7 @@ struct PACKED tu_autotune::config_t { str += ", Metric Flags: 0x" + std::to_string(metric_flags) + " ("; METRICF_STR(SAMPLES); + METRICF_STR(TS); str += ")"; return str; @@ -244,6 +259,12 @@ tu_autotune::get_env_config() std::string_view algo_strv(algo_env_str); if (algo_strv == "bandwidth") { algo = algorithm::BANDWIDTH; + } else if (algo_strv == "profiled") { + algo = algorithm::PROFILED; + } else if (algo_strv == "profiled_imm") { + algo = algorithm::PROFILED_IMM; + } else { + mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_env_str); } if (TU_DEBUG(STARTUP)) @@ -505,6 +526,22 @@ struct tu_autotune::rp_entry { } } + /** RP/Tile Timestamp Metric **/ + + uint64_t get_rp_duration() + { + assert(config.test(metric_flag::TS)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.ts_end - gpu.ts_start; + } + + void emit_metric_timestamp(struct tu_cs *cs, uint64_t timestamp_iova) + { + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) | CP_REG_TO_MEM_0_CNT(2) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, timestamp_iova); + } + /** CS Emission **/ void emit_rp_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -513,6 +550,9 @@ struct tu_autotune::rp_entry { uint64_t bo_iova = bo.iova; if (config.test(metric_flag::SAMPLES)) emit_metric_samples_start(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_start)); } void emit_rp_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -522,6 +562,9 @@ struct tu_autotune::rp_entry { if (config.test(metric_flag::SAMPLES)) emit_metric_samples_end(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start), bo_iova + offsetof(rp_gpu_data, samples_end)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_end)); } }; @@ -648,15 +691,71 @@ template class exponential_average { } }; +/* An improvement over pure EMA to filter out spikes by using two EMAs: + * - A "slow" EMA with a low alpha to track the long-term average. + * - A "fast" EMA with a high alpha to track short-term changes. + * When retrieving the average, if the fast EMA deviates significantly from the slow EMA, it indicates a spike, and we + * fall back to the slow EMA. 
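+ *
+ * In effect, get() below blends the two as:
+ *
+ *   deviation = |fast - slow| / slow
+ *   result    = (deviation < threshold) ? fast
+ *                                       : slow + (fast - slow) * threshold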
+ */ +template class adaptive_average { + private: + static constexpr double DefaultSlowAlpha = 0.1, DefaultFastAlpha = 0.5, DefaultDeviationThreshold = 0.3; + exponential_average slow; + exponential_average fast; + double deviationThreshold; + + public: + size_t count = 0; + + explicit adaptive_average(double slow_alpha = DefaultSlowAlpha, + double fast_alpha = DefaultFastAlpha, + double deviation_threshold = DefaultDeviationThreshold) noexcept + : slow(slow_alpha), fast(fast_alpha), deviationThreshold(deviation_threshold) + { + } + + void add(T value) noexcept + { + slow.add(value); + fast.add(value); + count++; + } + + T get() const noexcept + { + double s = slow.get(); + double f = fast.get(); + /* Use fast if it's close to slow (normal variation). + * Use slow if fast deviates too much (likely a spike). + */ + double deviation = std::abs(f - s) / s; + return (deviation < deviationThreshold) ? f : s + (f - s) * deviationThreshold; + } + + void clear() noexcept + { + slow.clear(); + fast.clear(); + count = 0; + } +}; + /* All historical state pertaining to a uniquely identified RP. This integrates data from RP entries, accumulating * metrics over the long-term and providing autotune algorithms using the data. */ struct tu_autotune::rp_history { + private: + /* Amount of duration samples for profiling before we start averaging. */ + static constexpr uint32_t MIN_PROFILE_DURATION_COUNT = 5; + + adaptive_average sysmem_rp_average; + adaptive_average gmem_rp_average; + public: uint64_t hash; /* The hash of the renderpass, just for debug output. */ uint32_t duplicates; - rp_history(uint64_t hash): hash(hash) + rp_history(uint64_t hash): hash(hash), profiled(hash) { } @@ -720,6 +819,90 @@ struct tu_autotune::rp_history { } } bandwidth; + /** Profiled Algorithms **/ + struct profiled_algo { + private: + /* Range [0 (GMEM), 100 (SYSMEM)], where 50 means no preference. */ + constexpr static uint32_t PROBABILITY_MAX = 100, PROBABILITY_MID = 50; + constexpr static uint32_t PROBABILITY_PREFER_SYSMEM = 80, PROBABILITY_PREFER_GMEM = 20; + + std::atomic sysmem_probability = PROBABILITY_MID; + bool should_reset = false; /* If true, will reset sysmem_probability before next update. */ + uint64_t seed[2] { 0x3bffb83978e24f88, 0x9238d5d56c71cd35 }; + + public: + profiled_algo(uint64_t hash) + { + seed[1] = hash; + } + + void update(rp_history &history, bool immediate) + { + auto &sysmem_ema = history.sysmem_rp_average; + auto &gmem_ema = history.gmem_rp_average; + uint32_t sysmem_prob = sysmem_probability.load(std::memory_order_relaxed); + if (immediate) { + /* Try to immediately resolve the probability, this is useful for CI running a single trace of frames where + * the probabilites aren't expected to change from run to run. This environment also gives us a best case + * scenario for autotune performance, since we know the optimal decisions. + */ + + if (sysmem_prob == 0 || sysmem_prob == 100) + return; /* Already resolved, no further updates are necessary. */ + + if (sysmem_ema.count < 1) { + sysmem_prob = PROBABILITY_MAX; + } else if (gmem_ema.count < 1) { + sysmem_prob = 0; + } else { + sysmem_prob = gmem_ema.get() < sysmem_ema.get() ? 0 : PROBABILITY_MAX; + } + } else { + if (sysmem_ema.count < MIN_PROFILE_DURATION_COUNT || gmem_ema.count < MIN_PROFILE_DURATION_COUNT) { + /* Not enough data to make a decision, bias towards least used. */ + sysmem_prob = sysmem_ema.count < gmem_ema.count ? 
PROBABILITY_PREFER_SYSMEM : PROBABILITY_PREFER_GMEM; + should_reset = true; + } else { + if (should_reset) { + sysmem_prob = PROBABILITY_MID; + should_reset = false; + } + + /* Adjust probability based on timing results. */ + constexpr uint32_t STEP_DELTA = 5, MIN_PROBABILITY = 5, MAX_PROBABILITY = 95; + + uint64_t avg_sysmem = sysmem_ema.get(); + uint64_t avg_gmem = gmem_ema.get(); + if (avg_gmem < avg_sysmem && sysmem_prob > MIN_PROBABILITY) { + sysmem_prob = MAX2(sysmem_prob - STEP_DELTA, MIN_PROBABILITY); + } else if (avg_sysmem < avg_gmem && sysmem_prob < MAX_PROBABILITY) { + sysmem_prob = MIN2(sysmem_prob + STEP_DELTA, MAX_PROBABILITY); + } + } + } + + sysmem_probability.store(sysmem_prob, std::memory_order_relaxed); + + at_log_profiled_h("update%s avg_gmem: %" PRIu64 " us (%" PRIu64 " samples) avg_sysmem: %" PRIu64 + " us (%" PRIu64 " samples) = sysmem_probability: %" PRIu32, + history.hash, immediate ? "-imm" : "", ticks_to_us(gmem_ema.get()), gmem_ema.count, + ticks_to_us(sysmem_ema.get()), sysmem_ema.count, sysmem_prob); + } + + public: + render_mode get_optimal_mode(rp_history &history) + { + uint32_t l_sysmem_probability = sysmem_probability.load(std::memory_order_relaxed); + bool select_sysmem = (rand_xorshift128plus(seed) % PROBABILITY_MAX) < l_sysmem_probability; + render_mode mode = select_sysmem ? render_mode::SYSMEM : render_mode::GMEM; + + at_log_profiled_h("%" PRIu32 "%% sysmem chance, using %s", history.hash, l_sysmem_probability, + render_mode_str(mode)); + + return mode; + } + } profiled; + void process(rp_entry &entry, tu_autotune &at) { /* We use entry config to know what metrics it has, autotune config to know what algorithms are enabled. */ @@ -728,6 +911,19 @@ struct tu_autotune::rp_history { if (entry_config.test(metric_flag::SAMPLES) && at_config.is_enabled(algorithm::BANDWIDTH)) bandwidth.update(entry.get_samples_passed()); + if (entry_config.test(metric_flag::TS)) { + if (entry.sysmem) { + uint64_t rp_duration = entry.get_rp_duration(); + + sysmem_rp_average.add(rp_duration); + } else { + gmem_rp_average.add(entry.get_rp_duration()); + } + + if (at_config.is_enabled(algorithm::PROFILED) || at_config.is_enabled(algorithm::PROFILED_IMM)) { + profiled.update(*this, at_config.is_enabled(algorithm::PROFILED_IMM)); + } + } } }; @@ -950,6 +1146,9 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx auto &history = find_or_create_rp_history(key); *rp_ctx = &cb_ctx.attach_rp_entry(device, history, config, rp_state->drawcall_count); + if (config.is_enabled(algorithm::PROFILED) || config.is_enabled(algorithm::PROFILED_IMM)) + return history.profiled.get_optimal_mode(history); + if (config.is_enabled(algorithm::BANDWIDTH)) return history.bandwidth.get_optimal_mode(history, cmd_state, pass, framebuffer, rp_state); From 8f9ee83d5a62641cf19beba98682e11eee15a1ca Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:30 +0000 Subject: [PATCH 07/25] util/math: Add ROUND_DOWN_TO_NPOT The default ROUND_DOWN_TO only handles POT alignment values, so an additional variant was added which handles NPOT alignment too. 
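For illustration (arbitrary values): with an NPOT alignment of 96,
ROUND_DOWN_TO(1000, 96) masks with (96 - 1) and yields 928, which is not a
multiple of 96, whereas ROUND_DOWN_TO_NPOT(1000, 96) computes
1000 - (1000 % 96) = 960 as expected.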
Signed-off-by: Dhruv Mark Collins --- src/util/u_math.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/util/u_math.h b/src/util/u_math.h index 354683bb4ce..2c5f97b9875 100644 --- a/src/util/u_math.h +++ b/src/util/u_math.h @@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment) return ((value) & ~(uint64_t)(alignment - 1)); } +static inline uint64_t +ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment) +{ + return value - (value % alignment); +} + /** * Align a value, only works pot alignemnts. */ From 9e60565d468acbaa20a8f742f7cadb750bf826b9 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:54 +0000 Subject: [PATCH 08/25] tu/autotune: Prefer SYSMEM when only SW binning is possible In cases where only SW binning is possible and where there would be a performance impact from not using HW binning (i.e. > 2 tiles), it is preferable to default to SYSMEM as the performance impact of using GMEM is almost definitely not going to be worth it. Signed-off-by: Dhruv Mark Collins --- src/freedreno/vulkan/tu_cmd_buffer.cc | 14 +++++++++++--- src/freedreno/vulkan/tu_device.h | 7 +++++-- src/freedreno/vulkan/tu_util.cc | 14 +++++--------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index eeee54e2e34..1b01e6e7a1b 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -1273,7 +1273,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) return true; } - return vsc->binning; + return vsc->binning_possible && vsc->binning_useful; } static bool @@ -1328,8 +1328,16 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; } - if (TU_DEBUG(GMEM)) + if (TU_DEBUG(GMEM)) { + cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)"; return false; + } + + /* This is a case where it's better to avoid GMEM, too many tiles but no HW binning possible. */ + if (!vsc->binning_possible && vsc->binning_useful) { + cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible"; + return true; + } bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM; if (use_sysmem) @@ -6464,7 +6472,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r * (perf queries), then we can't do this optimization since the * start-of-the-CS geometry condition will have been overwritten. */ - bool cond_load_allowed = vsc->binning && + bool cond_load_allowed = vsc->binning_possible && cmd->state.pass->has_cond_load_store && !cmd->state.rp.draw_cs_writes_to_cond_pred; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 049d5bd581b..ae88516564f 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -547,8 +547,11 @@ struct tu_vsc_config { /* Whether binning could be used for gmem rendering using this framebuffer. */ bool binning_possible; - /* Whether binning should be used for gmem rendering using this framebuffer. */ - bool binning; + /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether + * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance + * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so. 
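+ *
+ * Rough summary of how the two flags combine (see use_hw_binning() and
+ * use_sysmem_rendering() in tu_cmd_buffer.cc; their earlier special cases
+ * are omitted here):
+ *
+ *   binning_possible && binning_useful   -> GMEM rendering uses HW binning
+ *   binning_possible && !binning_useful  -> GMEM may be used, without HW binning
+ *   !binning_possible && binning_useful  -> SYSMEM is forced, GMEM is avoided
+ *   !binning_possible && !binning_useful -> GMEM without HW binning is acceptable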
+ */ + bool binning_useful; /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index e19d43bb8a9..0ebceae8998 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -460,16 +460,12 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc, static void tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device) { - if (vsc->binning_possible) { - vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2; + vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2; - if (TU_DEBUG(FORCEBIN)) - vsc->binning = true; - if (TU_DEBUG(NOBIN)) - vsc->binning = false; - } else { - vsc->binning = false; - } + if (TU_DEBUG(FORCEBIN)) + vsc->binning_useful = true; + if (TU_DEBUG(NOBIN)) + vsc->binning_useful = false; } void From e167c2ec4d3c38197cea13270a6ae53c18fe68b2 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 20:52:25 +0000 Subject: [PATCH 09/25] tu/autotune: Add "Preempt Optimize" mode This introduces a new option that makes autotune optimize for low preemption latency which is crucial to ensure responsiveness on systems with GPU-based composition. A large enough draw can entirely block the compositor from running with draw-level preemption, this can be mitigated by preferring to use GMEM which breaks up the draw into smaller pieces and generally has a lower latency for preemption. As a further mitigation, tiles in GMEM are then divided into smaller and smaller pieces which lowers the non-preemptible duration. There are static checks in place to avoid doing this when it would incur a cost that is too large. Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 8 + src/freedreno/vulkan/tu_autotune.cc | 340 +++++++++++++++++++++++++- src/freedreno/vulkan/tu_autotune.h | 17 +- src/freedreno/vulkan/tu_clear_blit.cc | 5 +- src/freedreno/vulkan/tu_cmd_buffer.cc | 37 ++- src/freedreno/vulkan/tu_cmd_buffer.h | 8 +- src/freedreno/vulkan/tu_device.cc | 4 +- src/freedreno/vulkan/tu_device.h | 3 +- src/freedreno/vulkan/tu_pass.h | 2 + src/freedreno/vulkan/tu_queue.cc | 3 + src/freedreno/vulkan/tu_util.cc | 94 ++++++- src/freedreno/vulkan/tu_util.h | 13 +- 12 files changed, 497 insertions(+), 37 deletions(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index 163182c5490..947872a4c4d 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -708,6 +708,14 @@ environment variables: is lower than a certain threshold. The benefits of GMEM rendering are less pronounced in these smaller RPs and SYSMEM rendering tends to win more often. + ``preempt_optimize`` + Tries to keep non-preemptible time in the render pass is below a certain + threshold. This is useful for systems with GPU-based compositors where long + non-preemptible times can lead to missed frame deadlines, causing noticeable + stuttering. This flag will reduce the performance of the render pass in order + to improve overall system responsiveness, it should not be used unless the + rest of the system is affected by preemption delays. + Multiple flags can be combined by separating them with commas, e.g. ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``. 
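To make the tile-size control loop concrete, a minimal sketch is given below; the ~1.5 ms target and the +/-1 divisor steps follow the values used later in this patch, while the function name, the microsecond units and the omission of the time-based hysteresis are simplifications for illustration only:

   #include <cstdint>

   /* Illustrative only: adjust a GMEM tile-size divisor so that the longest
    * measured per-tile duration stays below a target, trading some GMEM
    * efficiency for shorter non-preemptible stretches.
    */
   static unsigned
   update_tile_divisor(unsigned divisor, unsigned divisor_max,
                       uint64_t max_tile_duration_us)
   {
      const uint64_t target_us = 1500; /* ~1.5 ms non-preemptible budget */

      if (max_tile_duration_us > target_us && divisor < divisor_max)
         return divisor + 1; /* tiles run too long: split them further */

      /* Only grow tiles again when comfortably below the target (4x margin),
       * to avoid oscillating between divisors.
       */
      if (max_tile_duration_us * 4 < target_us && divisor > 1)
         return divisor - 1;

      return divisor;
   }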
diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index f1d0fab5e6a..8dda5bfb190 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -27,6 +27,7 @@ #define TU_AUTOTUNE_DEBUG_LOG_BASE 0 #define TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH 0 #define TU_AUTOTUNE_DEBUG_LOG_PROFILED 0 +#define TU_AUTOTUNE_DEBUG_LOG_PREEMPT 0 #if TU_AUTOTUNE_DEBUG_LOG_BASE #define at_log_base(fmt, ...) mesa_logi("autotune: " fmt, ##__VA_ARGS__) @@ -48,6 +49,12 @@ #define at_log_profiled_h(fmt, hash, ...) #endif +#if TU_AUTOTUNE_DEBUG_LOG_PREEMPT +#define at_log_preempt_h(fmt, hash, ...) mesa_logi("autotune-preempt %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_preempt_h(fmt, hash, ...) +#endif + /* Process any pending entries on autotuner finish, could be used to gather data from traces. */ #define TU_AUTOTUNE_FLUSH_AT_FINISH 0 @@ -97,12 +104,14 @@ enum class tu_autotune::algorithm : uint8_t { enum class tu_autotune::mod_flag : uint8_t { BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ SMALL_SYSMEM = BIT(2), /* All RPs with <= 5 draws use SYSMEM. */ + PREEMPT_OPTIMIZE = BIT(3), /* Attempts to minimize the preemption latency. */ }; /* Metric flags, for internal tracking of enabled metrics. */ enum class tu_autotune::metric_flag : uint8_t { SAMPLES = BIT(1), /* Enable tracking samples passed metric. */ TS = BIT(2), /* Enable tracking per-RP timestamp metric. */ + TS_TILE = BIT(3), /* Enable tracking per-tile timestamp metric. */ }; struct PACKED tu_autotune::config_t { @@ -119,6 +128,10 @@ struct PACKED tu_autotune::config_t { } else if (algo == algorithm::PROFILED || algo == algorithm::PROFILED_IMM) { metric_flags |= (uint8_t) metric_flag::TS; } + + if (mod_flags & (uint8_t) mod_flag::PREEMPT_OPTIMIZE) { + metric_flags |= (uint8_t) metric_flag::TS | (uint8_t) metric_flag::TS_TILE; + } } public: @@ -197,11 +210,13 @@ struct PACKED tu_autotune::config_t { str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; MODF_STR(BIG_GMEM); MODF_STR(SMALL_SYSMEM); + MODF_STR(PREEMPT_OPTIMIZE); str += ")"; str += ", Metric Flags: 0x" + std::to_string(metric_flags) + " ("; METRICF_STR(SAMPLES); METRICF_STR(TS); + METRICF_STR(TS_TILE); str += ")"; return str; @@ -278,6 +293,7 @@ tu_autotune::get_env_config() static const struct debug_control tu_at_flags_control[] = { { "big_gmem", (uint32_t) mod_flag::BIG_GMEM }, { "small_sysmem", (uint32_t) mod_flag::SMALL_SYSMEM }, + { "preempt_optimize", (uint32_t) mod_flag::PREEMPT_OPTIMIZE }, { NULL, 0 } }; @@ -380,6 +396,18 @@ struct PACKED tu_autotune::rp_gpu_data { uint64_t ts_end; }; +/* Per-tile values for GMEM rendering, this structure is appended to the end of rp_gpu_data for each tile. */ +struct PACKED tu_autotune::tile_gpu_data { + uint64_t ts_start; + uint64_t ts_end; + + /* A helper for the offset of this relative to BO start. */ + static constexpr uint64_t offset(uint32_t tile_index) + { + return sizeof(rp_gpu_data) + (tile_index * sizeof(tile_gpu_data)); + } +}; + /* An "entry" of renderpass autotune results, which is used to store the results of a renderpass autotune run for a * given command buffer. */ struct tu_autotune::rp_entry { @@ -392,11 +420,13 @@ struct tu_autotune::rp_entry { static_assert(alignof(rp_gpu_data) == 16); static_assert(offsetof(rp_gpu_data, samples_start) == 0); static_assert(offsetof(rp_gpu_data, samples_end) == 16); + static_assert(sizeof(rp_gpu_data) % alignof(tile_gpu_data) == 0); public: rp_history *history; /* Guaranteed to never be nullptr. 
*/ config_t config; /* Configuration at the time of entry creation. */ bool sysmem; + uint32_t tile_count; uint32_t draw_count; /* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ @@ -421,7 +451,7 @@ struct tu_autotune::rp_entry { rp_entry(rp_entry &&other) noexcept : device(other.device), bo(other.bo), map(other.map), history(other.history), config(other.config), - sysmem(other.sysmem), draw_count(other.draw_count) + sysmem(other.sysmem), tile_count(other.tile_count), draw_count(other.draw_count) { other.map = nullptr; /* Prevent the destructor from freeing the BO. */ } @@ -435,6 +465,7 @@ struct tu_autotune::rp_entry { history = other.history; config = other.config; sysmem = other.sysmem; + tile_count = other.tile_count; draw_count = other.draw_count; other.map = nullptr; @@ -442,10 +473,11 @@ struct tu_autotune::rp_entry { return *this; } - void allocate(bool sysmem) + void allocate(bool sysmem, uint32_t tile_count) { this->sysmem = sysmem; - size_t total_size = sizeof(rp_gpu_data); + this->tile_count = tile_count; + size_t total_size = sizeof(rp_gpu_data) + (tile_count * sizeof(tile_gpu_data)); std::scoped_lock lock(device->autotune->suballoc_mutex); VkResult result = tu_suballoc_bo_alloc(&bo, &device->autotune->suballoc, total_size, alignof(rp_gpu_data)); @@ -464,6 +496,14 @@ struct tu_autotune::rp_entry { return *(rp_gpu_data *) map; } + tile_gpu_data &get_tile_gpu_data(uint32_t tile_index) + { + assert(map); + assert(tile_index < tile_count); + uint64_t offset = tile_gpu_data::offset(tile_index); + return *(tile_gpu_data *) (map + offset); + } + /** Samples-Passed Metric **/ uint64_t get_samples_passed() @@ -535,6 +575,20 @@ struct tu_autotune::rp_entry { return gpu.ts_end - gpu.ts_start; } + /* The amount of cycles spent in the longest tile. This is used to calculate the average draw duration for + * determining the largest non-preemptible duration for GMEM rendering. 
+ */ + uint64_t get_max_tile_duration() + { + assert(config.test(metric_flag::TS_TILE)); + uint64_t max_duration = 0; + for (uint32_t i = 0; i < tile_count; i++) { + tile_gpu_data &tile = get_tile_gpu_data(i); + max_duration = MAX2(max_duration, tile.ts_end - tile.ts_start); + } + return max_duration; + } + void emit_metric_timestamp(struct tu_cs *cs, uint64_t timestamp_iova) { tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); @@ -566,6 +620,24 @@ struct tu_autotune::rp_entry { if (config.test(metric_flag::TS)) emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_end)); } + + void emit_tile_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_start)); + } + + void emit_tile_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_end)); + } }; tu_autotune::rp_entry_batch::rp_entry_batch(): active(false), fence(0), entries() @@ -699,7 +771,7 @@ template class exponential_average { */ template class adaptive_average { private: - static constexpr double DefaultSlowAlpha = 0.1, DefaultFastAlpha = 0.5, DefaultDeviationThreshold = 0.3; + static constexpr double DEFAULT_SLOW_ALPHA = 0.1, DEFAULT_FAST_ALPHA = 0.5, DEFAULT_DEVIATION_THRESHOLD = 0.3; exponential_average slow; exponential_average fast; double deviationThreshold; @@ -707,9 +779,9 @@ template class adaptive_average { public: size_t count = 0; - explicit adaptive_average(double slow_alpha = DefaultSlowAlpha, - double fast_alpha = DefaultFastAlpha, - double deviation_threshold = DefaultDeviationThreshold) noexcept + explicit adaptive_average(double slow_alpha = DEFAULT_SLOW_ALPHA, + double fast_alpha = DEFAULT_FAST_ALPHA, + double deviation_threshold = DEFAULT_DEVIATION_THRESHOLD) noexcept : slow(slow_alpha), fast(fast_alpha), deviationThreshold(deviation_threshold) { } @@ -903,6 +975,152 @@ struct tu_autotune::rp_history { } } profiled; + /** Preemption Latency Optimization Mode **/ + struct preempt_optimize_mode { + private: + adaptive_average sysmem_draw_average; + adaptive_average gmem_tile_average; + + /* If the renderpass has long draws which are at risk of causing high preemptible latency. */ + std::atomic latency_risk = false; + /* The factor by which the tile size should be divided to reduce preemption latency. */ + std::atomic tile_size_divisor = 1; + + /* The next timestamp to update the latency sensitivity parameters at. */ + uint64_t latency_update_ts = 0; + /* The next timestamp where it's allowed to decrement the divisor. */ + uint64_t divisor_decrement_ts = 0; + /* The next timestamp where it's allowed to mark the RP as no longer latency sensitive. 
*/ + uint64_t latency_switch_ts = 0; + + /* Threshold of longest non-preemptible duration before activating latency optimization: 1.5ms */ + static constexpr uint64_t TARGET_THRESHOLD = GPU_TICKS_PER_US * 1500; + + public: + void update_sysmem(rp_history &history, uint64_t draw_duration) + { + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + + if (!l_latency_risk) { + /* Try to estimate the minimum non-preemptible duration for draw-level preemption, by dividing the total + * time by the RP by the amount of draws. This isn't very accurate as it's skewed by the time taken by + * commands other than draws (e.g. clears or blits), but it's a good enough estimate to catch the worst + * offenders. + * + * If the average draw duration is above a certain threshold, we mark the RP as latency sensitive which + * should bias the decision towards GMEM. + */ + + sysmem_draw_average.add(draw_duration); + uint64_t avg_sysmem_draw = sysmem_draw_average.get(); + uint64_t sysmem_draw_count = sysmem_draw_average.count; + + at_log_preempt_h("avg_sysmem_draw: %" PRIu64 " us (%u), latency_risk: %u" + history.hash, ticks_to_us(avg_sysmem_draw), avg_sysmem_draw > TARGET_THRESHOLD, + l_latency_risk + ); + + if (sysmem_draw_count >= MIN_PROFILE_DURATION_COUNT && avg_sysmem_draw > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + at_log_preempt_h("high sysmem draw duration %" PRIu64 " us, marking as latency sensitive", history.hash, + ticks_to_us(avg_sysmem_draw)); + } + } + } + + void update_gmem(rp_history &history, uint64_t tile_duration) + { + constexpr uint64_t default_update_duration_ns = 100'000'000; /* 100ms */ + constexpr uint64_t change_update_duration_ns = 500'000'000; /* 500ms */ + constexpr uint64_t downward_update_duration_ns = 10'000'000'000; /* 10s */ + constexpr uint64_t latency_insensitive_duration_ns = 30'000'000'000; /* 30s */ + + gmem_tile_average.add(tile_duration); + + uint64_t now = os_time_get_nano(); + if (latency_update_ts > now) + return; /* No need to update yet. */ + + /* If the RP is latency sensitive and we're using GMEM, we should check if it's worth reducing the tile size to + * reduce the latency risk further or if it's already low enough that it's not worth the performance hit. + */ + + uint64_t update_duration_ns = default_update_duration_ns; + if (gmem_tile_average.count > MIN_PROFILE_DURATION_COUNT) { + uint64_t avg_gmem_tile = gmem_tile_average.get(); + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + if (!l_latency_risk) { + if (avg_gmem_tile > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + latency_switch_ts = now + latency_insensitive_duration_ns; + + at_log_preempt_h("high gmem tile duration %" PRIu64 ", marking as latency sensitive", history.hash, + avg_gmem_tile); + } + } else { + uint32_t l_tile_size_divisor = tile_size_divisor.load(std::memory_order_relaxed); + at_log_preempt_h("avg_gmem_tile: %" PRIu64 " us (%u), latency_risk: %u, tile_size_divisor: %" PRIu32, + history.hash, ticks_to_us(avg_gmem_tile), avg_gmem_tile > TARGET_THRESHOLD, + l_latency_risk, l_tile_size_divisor); + + int delta = 0; + if (avg_gmem_tile > TARGET_THRESHOLD && l_tile_size_divisor < TU_GMEM_LAYOUT_DIVISOR_MAX) { + /* If the average tile duration is high, we should reduce the tile size to reduce the latency risk. 
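To make the SYSMEM-side heuristic above concrete, a worked sketch with made-up numbers; the real code works in GPU ticks and additionally waits for MIN_PROFILE_DURATION_COUNT samples before acting:

/* Example: rp_duration = 12000 us across 6 draws -> ~2000 us per draw, which
 * exceeds the ~1500 us TARGET_THRESHOLD, so the RP is flagged as latency
 * sensitive. */
static inline bool
sketch_sysmem_latency_risk(uint64_t rp_duration, uint32_t draw_count,
                           uint64_t target_threshold)
{
   /* Coarse estimate: assume draws dominate the pass and split the total
    * duration evenly between them. */
   return draw_count != 0 && (rp_duration / draw_count) > target_threshold;
}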
*/ + delta = 1; + + divisor_decrement_ts = now + downward_update_duration_ns; + } else if (avg_gmem_tile * 4 < TARGET_THRESHOLD && l_tile_size_divisor > 1 && + divisor_decrement_ts <= now) { + /* If the average tile duration is low enough that we can get away with a larger tile size, we should + * increase the tile size to reduce the performance hit of the smaller tiles. + * + * Note: The 4x factor is to account for the tile duration being halved when we increase the tile size + * divisor by 1, with an additional 2x factor to generally be conservative about reducing the divisor + * since it can lead to oscillation between tile sizes. + * + * Similarly, divisor_decrement_ts is used to limit how often we can reduce the divisor to avoid + * oscillation. + */ + delta = -1; + latency_switch_ts = now + latency_insensitive_duration_ns; + } else if (avg_gmem_tile * 10 < TARGET_THRESHOLD && l_tile_size_divisor == 1 && + latency_switch_ts <= now) { + /* If the average tile duration is low enough that we no longer consider the RP latency sensitive, we + * can switch it back to non-latency sensitive. + */ + latency_risk.store(false, std::memory_order_relaxed); + } + + if (delta != 0) { + /* Clear all the results to avoid biasing the decision based on the old tile size. */ + gmem_tile_average.clear(); + + uint32_t new_tile_size_divisor = l_tile_size_divisor + delta; + at_log_preempt_h("updating tile size divisor: %" PRIu32 " -> %" PRIu32, history.hash, + l_tile_size_divisor, new_tile_size_divisor); + + tile_size_divisor.store(new_tile_size_divisor, std::memory_order_relaxed); + + update_duration_ns = change_update_duration_ns; + } + } + + latency_update_ts = now + update_duration_ns; + } + } + + /* If this RP has a risk of causing high preemption latency. */ + bool is_latency_sensitive() const + { + return latency_risk.load(std::memory_order_relaxed); + } + + uint32_t get_tile_size_divisor() const + { + return tile_size_divisor.load(std::memory_order_relaxed); + } + } preempt_optimize; + void process(rp_entry &entry, tu_autotune &at) { /* We use entry config to know what metrics it has, autotune config to know what algorithms are enabled. */ @@ -916,8 +1134,14 @@ struct tu_autotune::rp_history { uint64_t rp_duration = entry.get_rp_duration(); sysmem_rp_average.add(rp_duration); + + if (at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_sysmem(*this, rp_duration / entry.draw_count); } else { gmem_rp_average.add(entry.get_rp_duration()); + + if (entry_config.test(metric_flag::TS_TILE) && at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_gmem(*this, entry.get_max_tile_duration()); } if (at_config.is_enabled(algorithm::PROFILED) || at_config.is_enabled(algorithm::PROFILED_IMM)) { @@ -1126,10 +1350,26 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx if (!enabled || simultaneous_use) return default_mode; - if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) - return render_mode::GMEM; - if (config.test(mod_flag::SMALL_SYSMEM) && rp_state->drawcall_count <= 5) - return render_mode::SYSMEM; + /* We can return early with the decision based on the draw call count, instead of needing to hash the renderpass + * instance and look up the history, which is far more expensive. + * + * However, certain options such as latency sensitive mode take precedence over any of the other autotuner options + * and we cannot do so in those cases. 
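The GMEM-side divisor policy above condenses to the following helper; this is a sketch only, with the cooldown timestamps, the averaging, and the switch back to latency-insensitive omitted:

static inline int
sketch_divisor_delta(uint64_t avg_tile_ticks, uint32_t divisor,
                     uint64_t target_threshold, uint32_t divisor_max)
{
   if (avg_tile_ticks > target_threshold && divisor < divisor_max)
      return +1;   /* Tiles still too long: split them further. */
   if (avg_tile_ticks * 4 < target_threshold && divisor > 1)
      return -1;   /* Comfortably short: relax back towards larger tiles. */
   return 0;       /* Inside the hysteresis band: keep the current divisor. */
}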
+ */ + bool can_early_return = !config.test(mod_flag::PREEMPT_OPTIMIZE); + auto early_return_mode = [&]() -> std::optional { + if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) + return render_mode::GMEM; + if (config.test(mod_flag::SMALL_SYSMEM) && rp_state->drawcall_count <= 5) + return render_mode::SYSMEM; + return std::nullopt; + }(); + + if (can_early_return && early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (early)", rp_key(pass, framebuffer, cmd_buffer).hash, + rp_state->drawcall_count, render_mode_str(*early_return_mode)); + return *early_return_mode; + } rp_key key(pass, framebuffer, cmd_buffer); @@ -1146,6 +1386,20 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx auto &history = find_or_create_rp_history(key); *rp_ctx = &cb_ctx.attach_rp_entry(device, history, config, rp_state->drawcall_count); + if (config.test(mod_flag::PREEMPT_OPTIMIZE) && history.preempt_optimize.is_latency_sensitive()) { + /* Try to mitigate the risk of high preemption latency by always using GMEM, which should break up any larger + * draws into smaller ones with tiling. + */ + at_log_base_h("high preemption latency risk, using GMEM", key.hash); + return render_mode::GMEM; + } + + if (early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (late)", key.hash, rp_state->drawcall_count, + render_mode_str(*early_return_mode)); + return *early_return_mode; + } + if (config.is_enabled(algorithm::PROFILED) || config.is_enabled(algorithm::PROFILED_IMM)) return history.profiled.get_optimal_mode(history); @@ -1155,15 +1409,55 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx return default_mode; } +uint32_t +tu_autotune::get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer) +{ + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + + if (!enabled || !active_config.load().test(mod_flag::PREEMPT_OPTIMIZE) || rp_state->sysmem_single_prim_mode || + pass->has_fdm || cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + return 1; + + rp_key key(pass, framebuffer, cmd_buffer); + rp_history *history = find_rp_history(key); + if (!history) { + at_log_base_h("no RP history found, using tile_size_divisor=1", key.hash); + return 1; + } + + uint32_t tile_size_divisor = history->preempt_optimize.get_tile_size_divisor(); + + return tile_size_divisor; +} + +void +tu_autotune::disable_preempt_optimize() +{ + config_t original, updated; + do { + original = updated = active_config.load(); + if (!original.test(mod_flag::PREEMPT_OPTIMIZE)) + return; /* Already disabled, nothing to do. 
*/ + updated.disable(mod_flag::PREEMPT_OPTIMIZE); + } while (!active_config.compare_and_store(original, updated)); +} + /** RP-level CS emissions **/ void -tu_autotune::begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem) +tu_autotune::begin_renderpass( + struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count) { if (!rp_ctx) return; - rp_ctx->allocate(sysmem); + assert(sysmem || tile_count > 0); + assert(!sysmem || tile_count == 0); + + rp_ctx->allocate(sysmem, tile_count); rp_ctx->emit_rp_start(cmd, cs); } @@ -1175,3 +1469,23 @@ tu_autotune::end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_ rp_ctx->emit_rp_end(cmd, cs); } + +/** Tile-level CS emissions **/ + +void +tu_autotune::begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_start(cmd, cs, tile_idx); +} + +void +tu_autotune::end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_end(cmd, cs, tile_idx); +} diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 3b47508a2bb..5e68bc761ff 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -226,10 +226,25 @@ struct tu_autotune { render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx); - void begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem); + /* Returns the optimal tile size divisor for the given CB state. */ + uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer); + + /* Disables preemption latency optimization within the autotuner, this is used when high-priority queues are used to + * ensure that the autotuner does not interfere with the high-priority queue's performance. + * + * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior. + */ + void disable_preempt_optimize(); + + void + begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count); void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx); + void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + + void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + /* The submit-time hook for autotuner, this may return a CS (can be NULL) which must be amended for autotuner * tracking to function correctly. 
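For reference, the per-RP and per-tile hooks above are wired up by the command-buffer code later in this series; a simplified sketch of that call order, with the emission of the actual tile contents elided:

static void
sketch_record_gmem_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                        tu_autotune::rp_ctx_t rp_ctx,
                        uint32_t tile_count_w, uint32_t tile_count_h)
{
   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, /*sysmem=*/false,
                                           tile_count_w * tile_count_h);

   for (uint32_t y = 0; y < tile_count_h; y++) {
      for (uint32_t x = 0; x < tile_count_w; x++) {
         uint32_t tile_idx = y * tile_count_w + x; /* Row-major, as in tu6_render_tile(). */
         cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);
         /* ... emit the tile's draws ... */
         cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);
      }
   }

   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
}

A SYSMEM pass instead calls begin_renderpass() with sysmem=true and tile_count=0 and skips the per-tile hooks.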
* diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index 70daba82da0..fd677ba4ef2 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -5462,7 +5462,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) } } - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd); + + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); } struct apply_store_coords_state { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 1b01e6e7a1b..4fc9c0a6ec3 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -1247,8 +1247,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling) static bool use_hw_binning(struct tu_cmd_buffer *cmd) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; + struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = + tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); /* XFB commands are emitted for BINNING || SYSMEM, which makes it @@ -3078,7 +3079,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP)); } - cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true); + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0); tu_cs_sanity_check(cs); } @@ -3451,7 +3452,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (use_cb) tu_trace_start_render_pass(cmd); - cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false); + uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height; + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count); tu_cs_sanity_check(cs); } @@ -3460,13 +3462,18 @@ template static void tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_tile_config *tile, - bool fdm, const VkOffset2D *fdm_offsets) + bool fdm, const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { + uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x; tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm, fdm_offsets); tu_lrz_before_tile(cmd, &cmd->cs); trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd); + cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx); + /* Primitives that passed all tests are still counted in in each * tile even with HW binning beforehand. Do not permit it. 
*/ @@ -3478,6 +3485,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (cmd->state.prim_generated_query_running_before_rp) tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); + cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx); + if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) | @@ -3785,7 +3794,9 @@ void tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2, const struct tu_image_view *fdm, - const VkOffset2D *fdm_offsets) + const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { uint32_t width = tx2 - tx1; uint32_t height = ty2 - ty1; @@ -3848,7 +3859,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, continue; tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], - true, fdm_offsets); + true, fdm_offsets, + rp_ctx, vsc); } } } @@ -3936,7 +3948,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, if (merge_tiles) { tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm, - fdm_offsets); + fdm_offsets, rp_ctx, vsc); continue; } @@ -3960,7 +3972,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets); tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm, - fdm_offsets); + fdm_offsets, + rp_ctx, vsc); } slot_row += tile_row_stride; } @@ -4052,6 +4065,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments = NULL; cmd_buffer->state.clear_values = NULL; cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ + cmd_buffer->state.gmem_layout_divisor = 0; cmd_buffer->state.renderpass_cb_disabled = false; memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); @@ -6089,7 +6103,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, cmd->state.clear_values = suspended->state.suspended_pass.clear_values; cmd->state.render_area = suspended->state.suspended_pass.render_area; cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor; + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); cmd->state.lrz = suspended->state.suspended_pass.lrz; } @@ -7040,6 +7056,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.suspended_pass.attachments = cmd->state.attachments; cmd->state.suspended_pass.clear_values = cmd->state.clear_values; cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; + cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor; } tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index cd853826207..0f8aa1500d6 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -524,11 +524,12 @@ struct tu_cmd_state /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU * might get used by tu_store_gmem_attachment(). 
*/ - enum tu_gmem_layout gmem_layout; + tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; const struct tu_tiling_config *tiling; VkRect2D render_area; @@ -543,9 +544,10 @@ struct tu_cmd_state struct { const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; VkRect2D render_area; enum tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_image_view **attachments; VkClearValue *clear_values; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index e9f31fb67d5..a0d10d2c072 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -4002,7 +4002,7 @@ tu_CreateFramebuffer(VkDevice _device, } } - tu_framebuffer_tiling_config(framebuffer, device, pass); + tu_framebuffer_init_tiling_config(framebuffer, device, pass); /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */ if (msrtss_attachment_count > 0) { @@ -4064,7 +4064,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer, view->image->max_tile_h_constraint_fdm; } - tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass); + tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass); } VkResult diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index ae88516564f..dbfd2bc1554 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -580,7 +580,8 @@ struct tu_framebuffer uint32_t max_tile_w_constraint; uint32_t max_tile_h_constraint; - struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT]; + uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. 
*/ + struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX]; uint32_t attachment_count; const struct tu_image_view *attachments[0]; diff --git a/src/freedreno/vulkan/tu_pass.h b/src/freedreno/vulkan/tu_pass.h index da92babc657..5dc515f8db6 100644 --- a/src/freedreno/vulkan/tu_pass.h +++ b/src/freedreno/vulkan/tu_pass.h @@ -22,6 +22,8 @@ enum tu_gmem_layout TU_GMEM_LAYOUT_COUNT, }; +constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */ + struct tu_subpass_barrier { VkPipelineStageFlags2 src_stage_mask; VkPipelineStageFlags2 dst_stage_mask; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index d6acf399042..2f963ce9ee0 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -641,6 +641,9 @@ tu_queue_init(struct tu_device *device, queue->fence = -1; + if (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR) + device->autotune->disable_preempt_optimize(); + return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index 0ebceae8998..ffd2975659b 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc) return tiles_per_pipe <= 32; } +static void +tu_tiling_config_divide_tile(const struct tu_device *dev, + const struct tu_render_pass *pass, + const struct tu_framebuffer *fb, + const struct tu_tiling_config *tiling, + struct tu_tiling_config *new_tiling, + uint32_t divisor) +{ + assert(divisor > 0); + + *new_tiling = *tiling; + if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) { + /* If the divisor is 1, or if the tiling is not possible, or if the + * tiling is invalid, just return the original tiling. */ + return; + } + + /* Get the hardware-specified alignment values. */ + const uint32_t tile_align_w = pass->tile_align_w; + const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; + + /* Divide the current tile dimensions by the divisor. */ + uint32_t new_tile_width = tiling->tile0.width / divisor; + uint32_t new_tile_height = tiling->tile0.height / divisor; + + /* Clamp to the minimum alignment if necessary and align down. */ + if (new_tile_width < tile_align_w) + new_tile_width = tile_align_w; + else + new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w); + + if (new_tile_height < tile_align_h) + new_tile_height = tile_align_h; + else + new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h); + + new_tiling->tile0.width = new_tile_width; + new_tiling->tile0.height = new_tile_height; + + /* Recalculate the tile count from the framebuffer dimensions to ensure + * full coverage. 
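A worked example of the division and re-alignment above, using made-up dimensions; the real alignment values come from the pass and the device info:

/* fb = 1920x1088, tile0 = 576x384, tile_align = 96x32, divisor = 2:
 * width 576/2 = 288 (already 96-aligned), height 384/2 = 192 (32-aligned),
 * so the pass now needs ceil(1920/288) x ceil(1088/192) = 7 x 6 = 42 tiles. */
static_assert(576 / 2 == 288 && 288 % 96 == 0, "width stays aligned");
static_assert(384 / 2 == 192 && 192 % 32 == 0, "height stays aligned");
static_assert((1920 + 288 - 1) / 288 == 7 && (1088 + 192 - 1) / 192 == 6,
              "7 x 6 tiles cover the framebuffer");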
*/ + new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width); + new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height); +} + static void tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc, const struct tu_device *dev, @@ -469,9 +514,9 @@ tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_devic } void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass) +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass) { for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) { struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; @@ -495,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, tu_tiling_config_update_binning(fdm_offset_vsc, device); } } + + fb->initd_divisor = 1; +} + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor) +{ + assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX); + assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to + appropriately size the tiles for the framebuffer.*/ + struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout]; + + if (divisor > fb->initd_divisor) { + const struct tu_tiling_config *base_tiling = + tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1); + tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor); + + struct tu_vsc_config *vsc = &tiling->vsc; + if (tiling->possible) { + tu_tiling_config_update_pipe_layout(vsc, device, false); + tu_tiling_config_update_pipes(vsc, device); + tu_tiling_config_update_binning(vsc, device); + + struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc; + fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 }; + } + + if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */ + (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */ + (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */ + ) { + /* Revert to the previous level's tiling configuration. */ + *tiling = *base_tiling; + } + + fb->initd_divisor = divisor; + } + + return tiling; } void diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h index 7ce6d3e053a..b1ed4354e39 100644 --- a/src/freedreno/vulkan/tu_util.h +++ b/src/freedreno/vulkan/tu_util.h @@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...) 
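The flattened tiling[] lookup used above groups entries by divisor first and GMEM layout second; as a standalone sketch of the indexing:

static inline uint32_t
sketch_tiling_index(uint32_t gmem_layout, uint32_t divisor,
                    uint32_t layout_count /* TU_GMEM_LAYOUT_COUNT */)
{
   /* Divisor 1 occupies the first layout_count slots, divisor 2 the next
    * layout_count, and so on up to TU_GMEM_LAYOUT_DIVISOR_MAX. */
   return layout_count * (divisor - 1) + gmem_layout;
}

Entries for divisors above fb->initd_divisor are filled in lazily from the previous divisor level, and revert to that level when the divided configuration is not worthwhile.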
} while (0) void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass); + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor); #define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) From 1c607c72b058e54bb98aed039695224ae44171ec Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 21:06:59 +0000 Subject: [PATCH 10/25] tu/autotune: Log preemption delay perfcntr for debugging It's difficult to validate the accuracy of the latency optimization at the system level, this adds in a compile-time debug option to read the CP preemption delay performance counters which provides a much clearer picture of the improvements. Signed-off-by: Dhruv Mark Collins --- src/freedreno/vulkan/tu_autotune.cc | 222 +++++++++++++++++++++++++++- src/freedreno/vulkan/tu_autotune.h | 10 ++ 2 files changed, 230 insertions(+), 2 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 8dda5bfb190..aa214b6ed03 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -55,6 +55,12 @@ #define at_log_preempt_h(fmt, hash, ...) #endif +#if TU_AUTOTUNE_DEBUG_PERFCTR +#define at_log_perfctr_h(fmt, hash, ...) mesa_logi("autotune-perfctr %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_perfctr_h(fmt, hash, ...) +#endif + /* Process any pending entries on autotuner finish, could be used to gather data from traces. */ #define TU_AUTOTUNE_FLUSH_AT_FINISH 0 @@ -387,6 +393,16 @@ tu_autotune::get_cs_for_fence(uint32_t fence) /** RP Entry Management **/ +#if TU_AUTOTUNE_DEBUG_PERFCTR +struct PACKED tu_perf_ctr_sample { + uint64_t begin; + uint64_t end; + /* The selector value at the beginning/end, used to validate that the countable wasn't changed during a preemption. */ + uint32_t selector_begin; + uint32_t selector_end; +}; +#endif + /* The part of the per-RP entry which is written by the GPU. */ struct PACKED tu_autotune::rp_gpu_data { /* HW requires the sample start/stop locations to be 128b aligned. */ @@ -394,6 +410,12 @@ struct PACKED tu_autotune::rp_gpu_data { alignas(16) uint64_t samples_end; uint64_t ts_start; uint64_t ts_end; + +#if TU_AUTOTUNE_DEBUG_PERFCTR + struct tu_perf_ctr_sample preemption_reaction_delay, num_preemptions, always_count; + uint64_t cntrs_ready; + constexpr static uint64_t CNTRS_READY_MAGIC = 0xABCDEFEFE; +#endif }; /* Per-tile values for GMEM rendering, this structure is appended to the end of rp_gpu_data for each tile. */ @@ -596,6 +618,110 @@ struct tu_autotune::rp_entry { tu_cs_emit_qw(cs, timestamp_iova); } + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint64_t get_preemption_reaction_delay(tu_autotune &at, uint64_t rp_hash) + { + rp_gpu_data &gpu = get_gpu_data(); + + while (p_atomic_read(&gpu.cntrs_ready) != rp_gpu_data::CNTRS_READY_MAGIC) { + /* Just spin until the counter values are written out. 
*/ + } + + auto read_counter = [&](const struct tu_perf_ctr_sample &sample, const struct fd_perfcntr_countable *ctbl, + uint64_t &outValue, const char *name) { + if (sample.selector_begin != sample.selector_end || sample.selector_begin != ctbl->selector) { + mesa_loge( + "autotune %016" PRIx64 ": %s: selector mismatch %" PRIu32 " != %" PRIu32 " (%" PRIu32 " - %" PRIu32 ")", + rp_hash, ctbl->name, sample.selector_begin, sample.selector_end, sample.selector_begin, ctbl->selector); + } + + outValue = sample.end - sample.begin; + if (sample.end < sample.begin) { + mesa_loge("autotune %016" PRIx64 ": %s: end < begin %" PRIu64 " < %" PRIu64, rp_hash, name, sample.end, + sample.begin); + outValue = 0; + } + }; + + /* We read all counters for logging, even though we only need to return the preemption reaction delay. */ + uint64_t preemption_reaction_delay; + uint64_t num_preemptions; + uint64_t always_count; + read_counter(gpu.preemption_reaction_delay, at.preemption_reaction_delay, preemption_reaction_delay, + "preemption_reaction_delay"); + read_counter(gpu.num_preemptions, at.num_preemptions, num_preemptions, "num_preemptions"); + read_counter(gpu.always_count, at.always_count, always_count, "always_count"); + + if (preemption_reaction_delay || num_preemptions) { + at_log_perfctr_h("preemption_reaction_delay: %" PRIu64 ", always_count: %" PRIu64 + ", num_preemptions: %" PRIu64, + rp_hash, preemption_reaction_delay, always_count, num_preemptions); + } + + return preemption_reaction_delay; + } + + void emit_debug_perfcntr_start(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + auto countable_begin = [&](const struct fd_perfcntr_countable *ctbl, uint32_t cntr_idx, uint32_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + assert(!ctr->enable); /* CP counters shouldn't use it. 
*/ + + tu_cs_emit_pkt4(cs, ctr->select_reg, 1); + tu_cs_emit(cs, ctbl->selector); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_begin)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, begin)); + }; + + countable_begin(at.preemption_reaction_delay, 10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_begin(at.num_preemptions, 11, offsetof(rp_gpu_data, num_preemptions)); + countable_begin(at.always_count, 12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + } + + void emit_debug_perfcntr_end(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + tu_cs_emit_wfi(cs); + + auto countable_end = [&](uint32_t cntr_idx, uint64_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_end)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, end)); + }; + + countable_end(10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_end(11, offsetof(rp_gpu_data, num_preemptions)); + countable_end(12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, bo_iova + offsetof(rp_gpu_data, cntrs_ready)); + tu_cs_emit_qw(cs, rp_gpu_data::CNTRS_READY_MAGIC); + } +#endif + /** CS Emission **/ void emit_rp_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -607,6 +733,10 @@ struct tu_autotune::rp_entry { if (config.test(metric_flag::TS)) emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_start)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_start(cs, *cmd->device->autotune, bo_iova); +#endif } void emit_rp_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -619,6 +749,10 @@ struct tu_autotune::rp_entry { if (config.test(metric_flag::TS)) emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_end)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_end(cs, *cmd->device->autotune, bo_iova); +#endif } void emit_tile_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) @@ -996,6 +1130,17 @@ struct tu_autotune::rp_history { /* Threshold of longest non-preemptible duration before activating latency optimization: 1.5ms */ static constexpr uint64_t TARGET_THRESHOLD = GPU_TICKS_PER_US * 1500; +#if TU_AUTOTUNE_DEBUG_PERFCTR + /* The highest preemption reaction delay recorded for the RP since the last update. 
*/ + uint64_t max_preemption_latency = 0; + + public: + void update_preemption_latency(uint64_t preemption_latency) + { + max_preemption_latency = MAX2(max_preemption_latency, preemption_latency); + } +#endif + public: void update_sysmem(rp_history &history, uint64_t draw_duration) { @@ -1016,10 +1161,22 @@ struct tu_autotune::rp_history { uint64_t sysmem_draw_count = sysmem_draw_average.count; at_log_preempt_h("avg_sysmem_draw: %" PRIu64 " us (%u), latency_risk: %u" +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , history.hash, ticks_to_us(avg_sysmem_draw), avg_sysmem_draw > TARGET_THRESHOLD, l_latency_risk +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif ); +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif + if (sysmem_draw_count >= MIN_PROFILE_DURATION_COUNT && avg_sysmem_draw > TARGET_THRESHOLD) { latency_risk.store(true, std::memory_order_relaxed); at_log_preempt_h("high sysmem draw duration %" PRIu64 " us, marking as latency sensitive", history.hash, @@ -1059,9 +1216,22 @@ struct tu_autotune::rp_history { } } else { uint32_t l_tile_size_divisor = tile_size_divisor.load(std::memory_order_relaxed); - at_log_preempt_h("avg_gmem_tile: %" PRIu64 " us (%u), latency_risk: %u, tile_size_divisor: %" PRIu32, + at_log_preempt_h("avg_gmem_tile: %" PRIu64 " us (%u), latency_risk: %u, tile_size_divisor: %" PRIu32 +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , history.hash, ticks_to_us(avg_gmem_tile), avg_gmem_tile > TARGET_THRESHOLD, - l_latency_risk, l_tile_size_divisor); + l_latency_risk, l_tile_size_divisor +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif + ); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif int delta = 0; if (avg_gmem_tile > TARGET_THRESHOLD && l_tile_size_divisor < TU_GMEM_LAYOUT_DIVISOR_MAX) { @@ -1129,6 +1299,11 @@ struct tu_autotune::rp_history { if (entry_config.test(metric_flag::SAMPLES) && at_config.is_enabled(algorithm::BANDWIDTH)) bandwidth.update(entry.get_samples_passed()); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + preempt_optimize.update_preemption_latency(entry.get_preemption_reaction_delay(at, hash)); +#endif + if (entry_config.test(metric_flag::TS)) { if (entry.sysmem) { uint64_t rp_duration = entry.get_rp_duration(); @@ -1252,6 +1427,49 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result): device(dev { tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint32_t group_count; + const struct fd_perfcntr_group *groups = fd_perfcntrs(&device->physical_device->dev_id, &group_count); + + for (uint32_t i = 0; i < group_count; i++) { + if (strcmp(groups[i].name, "CP") == 0) { + cp_group = &groups[i]; + break; + } + } + + if (!cp_group) { + mesa_loge("autotune: CP group not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } else if (cp_group->num_countables < 5) { + mesa_loge("autotune: CP group has too few countables"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } + + auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group, + const char *name) -> const struct fd_perfcntr_countable * { + for (uint32_t i = 0; i < group->num_countables; i++) { + if (strcmp(group->countables[i].name, name) == 0) + return &group->countables[i]; + } + + mesa_loge("autotune: %s not found in group %s", name, group->name); + return nullptr; + }; + + preemption_reaction_delay = get_perfcntr_countable(cp_group, 
"PERF_CP_PREEMPTION_REACTION_DELAY"); + num_preemptions = get_perfcntr_countable(cp_group, "PERF_CP_NUM_PREEMPTIONS"); + always_count = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); + + if (!preemption_reaction_delay || !num_preemptions || !always_count) { + mesa_loge("autotune: preemption countables not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } +#endif + result = VK_SUCCESS; return; } diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 5e68bc761ff..c3199672d6b 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -19,6 +19,9 @@ #include "tu_cs.h" #include "tu_suballoc.h" +/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */ +#define TU_AUTOTUNE_DEBUG_PERFCTR 0 + /* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static * analysis, since we can adapt to the actual workload rather than relying on heuristics. @@ -186,6 +189,13 @@ struct tu_autotune { rp_history *find_rp_history(const rp_key &key); rp_history &find_or_create_rp_history(const rp_key &key); + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + const fd_perfcntr_group *cp_group; + const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count; +#endif + public: tu_autotune(struct tu_device *device, VkResult &result); From c54792072bb6aa80ae1a83f8e1a9a9b839e6e994 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:46 +0000 Subject: [PATCH 11/25] tu/autotune: Disable autotuning for small renderpasses by default Tuning these small renderpasses is difficult due to their high variability across command buffers and low impact on overall performance in most cases. This change disables autotuning for renderpasses with 5 or fewer draw calls unless the TUNE_SMALL modifier flag is explicitly set. Signed-off-by: Dhruv Mark Collins --- src/freedreno/vulkan/tu_autotune.cc | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index aa214b6ed03..1ace2d1f777 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -108,9 +108,9 @@ enum class tu_autotune::algorithm : uint8_t { /* Modifier flags, these modify the behavior of the autotuner in a user-defined way. */ enum class tu_autotune::mod_flag : uint8_t { - BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ - SMALL_SYSMEM = BIT(2), /* All RPs with <= 5 draws use SYSMEM. */ - PREEMPT_OPTIMIZE = BIT(3), /* Attempts to minimize the preemption latency. */ + BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ + TUNE_SMALL = BIT(2), /* Try tuning all RPs with <= 5 draws, ignored by default. */ + PREEMPT_OPTIMIZE = BIT(3), /* Attempts to minimize the preemption latency. */ }; /* Metric flags, for internal tracking of enabled metrics. 
*/ @@ -215,7 +215,7 @@ struct PACKED tu_autotune::config_t { str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; MODF_STR(BIG_GMEM); - MODF_STR(SMALL_SYSMEM); + MODF_STR(TUNE_SMALL); MODF_STR(PREEMPT_OPTIMIZE); str += ")"; @@ -298,7 +298,7 @@ tu_autotune::get_env_config() if (flags_env_str) { static const struct debug_control tu_at_flags_control[] = { { "big_gmem", (uint32_t) mod_flag::BIG_GMEM }, - { "small_sysmem", (uint32_t) mod_flag::SMALL_SYSMEM }, + { "tune_small", (uint32_t) mod_flag::TUNE_SMALL }, { "preempt_optimize", (uint32_t) mod_flag::PREEMPT_OPTIMIZE }, { NULL, 0 } }; @@ -1565,7 +1565,14 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx */ bool simultaneous_use = cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - if (!enabled || simultaneous_use) + /* These smaller RPs with few draws are too difficult to create a balanced hash for that can independently identify + * them while not being so unique to not properly identify them across CBs. They're generally insigificant outside of + * a few edge cases such as during deferred rendering G-buffer passes, as we don't have a good way to deal with those + * edge cases yet, we just disable the autotuner for small RPs entirely for now unless TUNE_SMALL is specified. + */ + bool ignore_small_rp = !config.test(mod_flag::TUNE_SMALL) && rp_state->drawcall_count <= 5; + + if (!enabled || simultaneous_use || ignore_small_rp) return default_mode; /* We can return early with the decision based on the draw call count, instead of needing to hash the renderpass @@ -1578,8 +1585,6 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx auto early_return_mode = [&]() -> std::optional { if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) return render_mode::GMEM; - if (config.test(mod_flag::SMALL_SYSMEM) && rp_state->drawcall_count <= 5) - return render_mode::SYSMEM; return std::nullopt; }(); From 0f6e420a7d2b6b5430ed8461cc652bcb14671326 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:49 +0000 Subject: [PATCH 12/25] tu/autotune: Add prefer SYSMEM mode Certain games tend to use rendering patterns that don't benefit from GMEM rendering, and thus we're better off not bothering with profiling them and simply defaulting to SYSMEM. Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 5 +++++ src/freedreno/vulkan/tu_autotune.cc | 12 +++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index 947872a4c4d..7f55cbe58a0 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -694,6 +694,11 @@ environment variables: for single-frame traces run multiple times in a CI where this algorithm can immediately chose the optimal rendering mode for each RP. + ``prefer_sysmem`` + Always chooses SYSMEM rendering. This is useful for games that don't benefit + from GMEM rendering due to their rendering patterns, setting this is better + than using ``TU_DEBUG=sysmem`` when done for performance reasons. + .. envvar:: TU_AUTOTUNE_FLAGS Modifies the behavior of the selected algorithm. 
Supported flags are: diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 1ace2d1f777..fc46ba6bbe6 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -99,9 +99,10 @@ render_mode_str(tu_autotune::render_mode mode) /** Configuration **/ enum class tu_autotune::algorithm : uint8_t { - BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ - PROFILED = 1, /* Uses dynamically profiled results for determining rendering mode. */ - PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ + BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ + PROFILED = 1, /* Uses dynamically profiled results for determining rendering mode. */ + PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ + PREFER_SYSMEM = 3, /* Always use SYSMEM unless we have strong evidence that GMEM is better. */ DEFAULT = BANDWIDTH, /* Default algorithm, used if no other is specified. */ }; @@ -212,6 +213,7 @@ struct PACKED tu_autotune::config_t { ALGO_STR(BANDWIDTH); ALGO_STR(PROFILED); ALGO_STR(PROFILED_IMM); + ALGO_STR(PREFER_SYSMEM); str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; MODF_STR(BIG_GMEM); @@ -284,6 +286,8 @@ tu_autotune::get_env_config() algo = algorithm::PROFILED; } else if (algo_strv == "profiled_imm") { algo = algorithm::PROFILED_IMM; + } else if (algo_strv == "prefer_sysmem") { + algo = algorithm::PREFER_SYSMEM; } else { mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_env_str); } @@ -1585,6 +1589,8 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx auto early_return_mode = [&]() -> std::optional { if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) return render_mode::GMEM; + if (config.is_enabled(algorithm::PREFER_SYSMEM)) + return render_mode::SYSMEM; return std::nullopt; }(); From 7127eefa4dd0aba521acd97079e57cf19acd0eca Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:50 +0000 Subject: [PATCH 13/25] tu+util: Allow setting autotune mode from driconf Allows for setting an override for the default autotune mode using driconf, allowing for setting policy on a per-app basis. Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 2 ++ src/freedreno/vulkan/tu_autotune.cc | 13 ++++++++----- src/freedreno/vulkan/tu_device.cc | 3 +++ src/freedreno/vulkan/tu_device.h | 3 +++ src/util/driconf.h | 4 ++++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index 7f55cbe58a0..44f2e0d4a6b 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -699,6 +699,8 @@ environment variables: from GMEM rendering due to their rendering patterns, setting this is better than using ``TU_DEBUG=sysmem`` when done for performance reasons. + The algorithm can be set via the driconf option ``tu_autotune_algorithm`` as well. + .. envvar:: TU_AUTOTUNE_FLAGS Modifies the behavior of the selected algorithm. 
Supported flags are: diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index fc46ba6bbe6..5bd4b54600d 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -275,11 +275,14 @@ tu_autotune::get_env_config() static std::once_flag once; static config_t at_config; std::call_once(once, [&] { - const char *algo_env_str = os_get_option("TU_AUTOTUNE_ALGO"); + const char *algo_str = os_get_option("TU_AUTOTUNE_ALGO"); algorithm algo = algorithm::DEFAULT; - if (algo_env_str) { - std::string_view algo_strv(algo_env_str); + if (!algo_str) + algo_str = device->instance->autotune_algo; /* From dri conf. */ + + if (algo_str) { + std::string_view algo_strv(algo_str); if (algo_strv == "bandwidth") { algo = algorithm::BANDWIDTH; } else if (algo_strv == "profiled") { @@ -289,11 +292,11 @@ tu_autotune::get_env_config() } else if (algo_strv == "prefer_sysmem") { algo = algorithm::PREFER_SYSMEM; } else { - mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_env_str); + mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_str); } if (TU_DEBUG(STARTUP)) - mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_env_str); + mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_str); } /* Parse the flags from the environment variable. */ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index a0d10d2c072..820f0bda174 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false) DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false) DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) + DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_SECTION_END }; @@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction"); instance->enable_softfloat32 = driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32"); + instance->autotune_algo = + driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index dbfd2bc1554..dffb2c3f001 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -234,6 +234,9 @@ struct tu_instance * However we don't want native Vulkan apps using this. */ bool enable_softfloat32; + + /* Configuration option to use a specific autotune algorithm by default. */ + const char *autotune_algo; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/util/driconf.h b/src/util/driconf.h index 57149581f64..a8082e3e942 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -654,6 +654,10 @@ DRI_CONF_OPT_B(tu_enable_softfloat32, def, \ "Enable softfloat emulation for float32 denormals") +#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \ + DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ + "Set the preferred autotune algorithm") + /** * \brief Honeykrisp specific configuration options */ From f81a8a6507c178c7c7dcd4441d08f07d7e842461 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:56 +0000 Subject: [PATCH 14/25] tu/autotune: Add render mode locking to PROFILED algorithm There are certain scenarios where even switching to another render mode has significant negative implications for performance even when done for a single invocation. 
Now we try to heuristically pick out these cases and lock them into the optimal mode, at the moment the heuristic is fairly conservative but it manages to lock RPs in under a minute in most cases. Signed-off-by: Dhruv Mark Collins --- src/freedreno/vulkan/tu_autotune.cc | 49 +++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 5bd4b54600d..fbcc22ff288 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -1041,6 +1041,7 @@ struct tu_autotune::rp_history { std::atomic sysmem_probability = PROBABILITY_MID; bool should_reset = false; /* If true, will reset sysmem_probability before next update. */ + bool locked = false; /* If true, the probability will no longer be updated. */ uint64_t seed[2] { 0x3bffb83978e24f88, 0x9238d5d56c71cd35 }; public: @@ -1051,6 +1052,9 @@ struct tu_autotune::rp_history { void update(rp_history &history, bool immediate) { + if (locked) + return; + auto &sysmem_ema = history.sysmem_rp_average; auto &gmem_ema = history.gmem_rp_average; uint32_t sysmem_prob = sysmem_probability.load(std::memory_order_relaxed); @@ -1060,15 +1064,13 @@ struct tu_autotune::rp_history { * scenario for autotune performance, since we know the optimal decisions. */ - if (sysmem_prob == 0 || sysmem_prob == 100) - return; /* Already resolved, no further updates are necessary. */ - if (sysmem_ema.count < 1) { sysmem_prob = PROBABILITY_MAX; } else if (gmem_ema.count < 1) { sysmem_prob = 0; } else { sysmem_prob = gmem_ema.get() < sysmem_ema.get() ? 0 : PROBABILITY_MAX; + locked = true; } } else { if (sysmem_ema.count < MIN_PROFILE_DURATION_COUNT || gmem_ema.count < MIN_PROFILE_DURATION_COUNT) { @@ -1082,14 +1084,41 @@ struct tu_autotune::rp_history { } /* Adjust probability based on timing results. */ - constexpr uint32_t STEP_DELTA = 5, MIN_PROBABILITY = 5, MAX_PROBABILITY = 95; + constexpr uint32_t STEP_DELTA = 5; /* 5% */ + constexpr uint32_t MIN_PROB = 5, MAX_PROB = 95; uint64_t avg_sysmem = sysmem_ema.get(); uint64_t avg_gmem = gmem_ema.get(); - if (avg_gmem < avg_sysmem && sysmem_prob > MIN_PROBABILITY) { - sysmem_prob = MAX2(sysmem_prob - STEP_DELTA, MIN_PROBABILITY); - } else if (avg_sysmem < avg_gmem && sysmem_prob < MAX_PROBABILITY) { - sysmem_prob = MIN2(sysmem_prob + STEP_DELTA, MAX_PROBABILITY); + + if (avg_gmem < avg_sysmem && sysmem_prob > MIN_PROB) { + sysmem_prob = MAX2(sysmem_prob - STEP_DELTA, MIN_PROB); + } else if (avg_sysmem < avg_gmem && sysmem_prob < MAX_PROB) { + sysmem_prob = MIN2(sysmem_prob + STEP_DELTA, MAX_PROB); + } + + /* If the RP duration exceeds a certain minimum duration threshold (i.e. has a large impact on frametime) + * and the percentage difference between the modes is large enough, we lock into the optimal mode. This + * avoids performance hazards from switching to an extremely suboptimal mode even if done very rarely. + * Note: Due to the potentially huge negative impact of a bad lock, this is a very conservative check. 
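A quick numeric check of the lock condition, using the thresholds defined just below and made-up averages:

/* Both sample counts >= MIN_LOCK_DURATION_COUNT, probability already resolved
 * to an extreme:
 *
 *   avg_gmem   = 1500 us
 *   avg_sysmem = 2400 us                       (max_avg >= the ~1 ms threshold)
 *   percent_diff = 100 * (2400 - 1500) / 1500 = 60 %  >= LOCK_PERCENT_DIFF
 *
 * -> GMEM wins, sysmem_probability is pinned to 0 and is never updated again
 *    for this renderpass. */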
+ */ + constexpr uint32_t MIN_LOCK_DURATION_COUNT = 50; + constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */ + constexpr uint32_t LOCK_PERCENT_DIFF = 40; + + bool has_resolved = sysmem_prob == MAX_PROB || sysmem_prob == MIN_PROB; + bool enough_samples = + sysmem_ema.count >= MIN_LOCK_DURATION_COUNT && gmem_ema.count >= MIN_LOCK_DURATION_COUNT; + uint64_t min_avg = MIN2(avg_sysmem, avg_gmem); + uint64_t max_avg = MAX2(avg_sysmem, avg_gmem); + uint64_t percent_diff = (100 * (max_avg - min_avg)) / min_avg; + + if (has_resolved && enough_samples && max_avg >= MIN_LOCK_THRESHOLD && + percent_diff >= LOCK_PERCENT_DIFF) { + if (avg_gmem < avg_sysmem) + sysmem_prob = 0; + else + sysmem_prob = 100; + locked = true; } } } @@ -1097,9 +1126,9 @@ struct tu_autotune::rp_history { sysmem_probability.store(sysmem_prob, std::memory_order_relaxed); at_log_profiled_h("update%s avg_gmem: %" PRIu64 " us (%" PRIu64 " samples) avg_sysmem: %" PRIu64 - " us (%" PRIu64 " samples) = sysmem_probability: %" PRIu32, + " us (%" PRIu64 " samples) = sysmem_probability: %" PRIu32 " locked: %u", history.hash, immediate ? "-imm" : "", ticks_to_us(gmem_ema.get()), gmem_ema.count, - ticks_to_us(sysmem_ema.get()), sysmem_ema.count, sysmem_prob); + ticks_to_us(sysmem_ema.get()), sysmem_ema.count, sysmem_prob, locked); } public: From fa5dacc63eab1c16d1139363032aa3f61224c5eb Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Thu, 9 Oct 2025 13:56:55 +0000 Subject: [PATCH 15/25] tu/autotune: Switch to PROFILED as default mode It's likely that profiled works much better as the default policy compared to bandwidth estimation as it is directly measuring the performance of GMEM and SYSMEM and basing the choice on that directly. Signed-off-by: Dhruv Mark Collins --- docs/drivers/freedreno.rst | 5 +++-- src/freedreno/vulkan/tu_autotune.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index 44f2e0d4a6b..3bfbb348bbb 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -679,14 +679,15 @@ environment variables: ``bandwidth`` Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses - the one with lower estimated bandwidth. This is the default algorithm. + the one with lower estimated bandwidth. ``profiled`` Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to move a probability distribution towards the optimal choice over time. This algorithm tends to be far more accurate than the bandwidth algorithm at choosing the optimal rendering mode but may result in larger FPS variance due to being - based on a probability distribution with random sampling. + based on a probability distribution with random sampling. This is the default + algorithm. ``profiled_imm`` Similar to ``profiled``, but only profiles the first few instances of a RP diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index fbcc22ff288..e5c3941e5ba 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -104,7 +104,7 @@ enum class tu_autotune::algorithm : uint8_t { PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ PREFER_SYSMEM = 3, /* Always use SYSMEM unless we have strong evidence that GMEM is better. */ - DEFAULT = BANDWIDTH, /* Default algorithm, used if no other is specified. */ + DEFAULT = PROFILED, /* Default algorithm, used if no other is specified. 
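The probability computed above is consumed by drawing a pseudo-random number per renderpass instance; a rough sketch of that draw, assuming the xorshift128+ generator implied by the seed[] member (the actual sampling site lives elsewhere in this file and may use the util/rand_xor helpers instead):

static inline bool
sketch_pick_sysmem(uint64_t seed[2], uint32_t sysmem_probability /* 0..100 */)
{
   /* Plain xorshift128+ step over the per-history seed. */
   uint64_t s1 = seed[0];
   const uint64_t s0 = seed[1];
   seed[0] = s0;
   s1 ^= s1 << 23;
   seed[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
   uint64_t r = seed[1] + s0;

   return (r % 100) < sysmem_probability;
}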
*/ }; /* Modifier flags, these modify the behavior of the autotuner in a user-defined way. */ From d95c43c852515df6b257788400fb4c4475f34df0 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Mon, 27 Oct 2025 08:10:28 +0000 Subject: [PATCH 16/25] tu/autotune: Allow 99% max probability in profiled mode The maximum probability was limited to 95% earlier due to the step delta of 5% (95+5=100% which we wanted to avoid). This introduces a new slower step delta after 95% which steps at 1% up to 99% which is significantly better in terms of eliminating the performance loss or stuttering from when there is a large difference between the modes. Signed-off-by: Dhruv Mark Collins --- src/freedreno/vulkan/tu_autotune.cc | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index e5c3941e5ba..e450535c780 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -1084,16 +1084,22 @@ struct tu_autotune::rp_history { } /* Adjust probability based on timing results. */ - constexpr uint32_t STEP_DELTA = 5; /* 5% */ - constexpr uint32_t MIN_PROB = 5, MAX_PROB = 95; + constexpr uint32_t FAST_STEP_DELTA = 5, FAST_MIN_PROBABILITY = 5, FAST_MAX_PROBABILITY = 95; + constexpr uint32_t SLOW_STEP_DELTA = 1, SLOW_MIN_PROBABILITY = 1, SLOW_MAX_PROBABILITY = 99; uint64_t avg_sysmem = sysmem_ema.get(); uint64_t avg_gmem = gmem_ema.get(); - if (avg_gmem < avg_sysmem && sysmem_prob > MIN_PROB) { - sysmem_prob = MAX2(sysmem_prob - STEP_DELTA, MIN_PROB); - } else if (avg_sysmem < avg_gmem && sysmem_prob < MAX_PROB) { - sysmem_prob = MIN2(sysmem_prob + STEP_DELTA, MAX_PROB); + if (avg_gmem < avg_sysmem) { + if (sysmem_prob > FAST_MIN_PROBABILITY && sysmem_prob <= FAST_MAX_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - FAST_STEP_DELTA, FAST_MIN_PROBABILITY); + else if (sysmem_prob > SLOW_MIN_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - SLOW_STEP_DELTA, SLOW_MIN_PROBABILITY); + } else if (avg_sysmem < avg_gmem) { + if (sysmem_prob >= FAST_MIN_PROBABILITY && sysmem_prob < FAST_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + FAST_STEP_DELTA, FAST_MAX_PROBABILITY); + else if (sysmem_prob < SLOW_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + SLOW_STEP_DELTA, SLOW_MAX_PROBABILITY); } /* If the RP duration exceeds a certain minimum duration threshold (i.e. has a large impact on frametime) @@ -1105,7 +1111,7 @@ struct tu_autotune::rp_history { constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */ constexpr uint32_t LOCK_PERCENT_DIFF = 40; - bool has_resolved = sysmem_prob == MAX_PROB || sysmem_prob == MIN_PROB; + bool has_resolved = sysmem_prob == SLOW_MAX_PROBABILITY || sysmem_prob == SLOW_MIN_PROBABILITY; bool enough_samples = sysmem_ema.count >= MIN_LOCK_DURATION_COUNT && gmem_ema.count >= MIN_LOCK_DURATION_COUNT; uint64_t min_avg = MIN2(avg_sysmem, avg_gmem); From 273c2121d04eb6edae10cfa4610472a5eebfc714 Mon Sep 17 00:00:00 2001 From: Dhruv Mark Collins Date: Mon, 27 Oct 2025 15:23:06 +0000 Subject: [PATCH 17/25] tu/autotune: Make profiled mode locking less conservative Lowers the thresholds for locking as the locks were occurring too infrequently when they were necessary, this is especially worse with the higher 99% max probability. 
Signed-off-by: Dhruv Mark Collins
---
 src/freedreno/vulkan/tu_autotune.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index e450535c780..22ce2b3069e 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -1107,9 +1107,9 @@ struct tu_autotune::rp_history {
           * avoids performance hazards from switching to an extremely suboptimal mode even if done very rarely.
           * Note: Due to the potentially huge negative impact of a bad lock, this is a very conservative check.
           */
-         constexpr uint32_t MIN_LOCK_DURATION_COUNT = 50;
+         constexpr uint32_t MIN_LOCK_DURATION_COUNT = 15;
          constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */
-         constexpr uint32_t LOCK_PERCENT_DIFF = 40;
+         constexpr uint32_t LOCK_PERCENT_DIFF = 30;
 
          bool has_resolved = sysmem_prob == SLOW_MAX_PROBABILITY || sysmem_prob == SLOW_MIN_PROBABILITY;
          bool enough_samples =

From 37c1f779df147b48f068d10ba272254ff04db14c Mon Sep 17 00:00:00 2001
From: Dhruv Mark Collins
Date: Tue, 28 Oct 2025 15:34:24 +0000
Subject: [PATCH 18/25] tu/autotune: Limit maximum number of RP histories to 1024

We allowed unbounded growth of RP history entries, which could lead to a
memory leak; limit them to a fairly large value of 1024 instead.

Signed-off-by: Dhruv Mark Collins
---
 src/freedreno/vulkan/tu_autotune.cc | 164 +++++++++++++++++++++++++---
 src/freedreno/vulkan/tu_autotune.h  |  13 ++-
 2 files changed, 156 insertions(+), 21 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index 22ce2b3069e..d42c3346a6c 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -5,6 +5,7 @@
 #include "tu_autotune.h"
 
+#include <algorithm>
 #include
 #include
 #include
 #include
@@ -437,6 +438,60 @@ struct PACKED tu_autotune::tile_gpu_data {
    }
 };
 
+/* A small wrapper around rp_history to provide ref-counting and usage timestamps. */
+struct tu_autotune::rp_history_handle {
+   rp_history *history;
+
+   /* Note: Must be called with rp_mutex held. */
+   rp_history_handle(rp_history &history);
+
+   constexpr rp_history_handle(std::nullptr_t): history(nullptr)
+   {
+   }
+
+   rp_history_handle(const rp_history_handle &) = delete;
+   rp_history_handle &operator=(const rp_history_handle &) = delete;
+
+   constexpr rp_history_handle(rp_history_handle &&other): history(other.history)
+   {
+      other.history = nullptr;
+   }
+
+   constexpr rp_history_handle &operator=(rp_history_handle &&other)
+   {
+      if (this != &other) {
+         history = other.history;
+         other.history = nullptr;
+      }
+      return *this;
+   }
+
+   constexpr operator bool() const
+   {
+      return history != nullptr;
+   }
+
+   constexpr rp_history &operator*() const
+   {
+      assert(history);
+      return *history;
+   }
+
+   constexpr operator rp_history *() const
+   {
+      assert(history);
+      return history;
+   }
+
+   constexpr rp_history *operator->() const
+   {
+      assert(history);
+      return history;
+   }
+
+   ~rp_history_handle();
+};
+
 /* An "entry" of renderpass autotune results, which is used to store the results of a renderpass autotune run for a
  * given command buffer. */
 struct tu_autotune::rp_entry {
@@ -452,8 +507,8 @@ struct tu_autotune::rp_entry {
    static_assert(sizeof(rp_gpu_data) % alignof(tile_gpu_data) == 0);
 
 public:
-   rp_history *history; /* Guaranteed to never be nullptr. */
-   config_t config;     /* Configuration at the time of entry creation. */
+   rp_history_handle history;
+   config_t config; /* Configuration at the time of entry creation. */
    bool sysmem;
    uint32_t tile_count;
    uint32_t draw_count;
@@ -461,8 +516,8 @@ struct tu_autotune::rp_entry {
    /* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */
    uint32_t duplicates = 0;
 
-   rp_entry(struct tu_device *device, rp_history &history, config_t config, uint32_t draw_count)
-      : device(device), map(nullptr), history(&history), config(config), draw_count(draw_count)
+   rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count)
+      : device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count)
    {
    }
@@ -479,7 +534,7 @@ struct tu_autotune::rp_entry {
    rp_entry &operator=(const rp_entry &) = delete;
 
    rp_entry(rp_entry &&other) noexcept
-      : device(other.device), bo(other.bo), map(other.map), history(other.history), config(other.config),
+      : device(other.device), bo(other.bo), map(other.map), history(std::move(other.history)), config(other.config),
        sysmem(other.sysmem), tile_count(other.tile_count), draw_count(other.draw_count)
    {
       other.map = nullptr; /* Prevent the destructor from freeing the BO. */
@@ -491,7 +546,7 @@ struct tu_autotune::rp_entry {
          device = other.device;
         bo = other.bo;
         map = other.map;
-        history = other.history;
+        history = std::move(other.history);
         config = other.config;
         sysmem = other.sysmem;
         tile_count = other.tile_count;
@@ -966,9 +1021,11 @@ struct tu_autotune::rp_history {
 public:
    uint64_t hash; /* The hash of the renderpass, just for debug output. */
-   uint32_t duplicates;
-   rp_history(uint64_t hash): hash(hash), profiled(hash)
+   std::atomic<uint32_t> refcount = 0; /* Reference count to prevent deletion when active. */
+   std::atomic<uint64_t> last_use_ts;  /* Last time the reference count was updated, in monotonic nanoseconds. */
+
+   rp_history(uint64_t hash): hash(hash), last_use_ts(os_time_get_nano()), profiled(hash)
    {
    }
@@ -1368,18 +1425,34 @@ struct tu_autotune::rp_history {
    }
 };
 
-tu_autotune::rp_history *
+tu_autotune::rp_history_handle::~rp_history_handle()
+{
+   if (!history)
+      return;
+
+   history->last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed);
+   ASSERTED uint32_t old_refcount = history->refcount.fetch_sub(1, std::memory_order_relaxed);
+   assert(old_refcount != 0); /* Underflow check. */
+}
+
+tu_autotune::rp_history_handle::rp_history_handle(rp_history &history): history(&history)
+{
+   history.refcount.fetch_add(1, std::memory_order_relaxed);
+   history.last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed);
+}
+
+tu_autotune::rp_history_handle
 tu_autotune::find_rp_history(const rp_key &key)
 {
    std::shared_lock lock(rp_mutex);
    auto it = rp_histories.find(key);
    if (it != rp_histories.end())
-      return &it->second;
+      return rp_history_handle(it->second);
 
-   return nullptr;
+   return rp_history_handle(nullptr);
 }
 
-tu_autotune::rp_history &
+tu_autotune::rp_history_handle
 tu_autotune::find_or_create_rp_history(const rp_key &key)
 {
    rp_history *existing = find_rp_history(key);
@@ -1392,7 +1465,63 @@ tu_autotune::find_or_create_rp_history(const rp_key &key)
    if (it != rp_histories.end())
      return it->second; /* Another thread created the history while we were waiting for the lock. */
 
    auto history = rp_histories.emplace(std::make_pair(key, key.hash));
-   return history.first->second;
+   return rp_history_handle(history.first->second);
+}
+
+void
+tu_autotune::reap_old_rp_histories()
+{
+   constexpr uint64_t REAP_INTERVAL_NS = 10'000'000'000; /* 10s */
+   uint64_t now = os_time_get_nano();
+   if (last_reap_ts + REAP_INTERVAL_NS > now)
+      return;
+   last_reap_ts = now;
+
+   constexpr size_t MAX_RP_HISTORIES = 1024; /* Not a hard limit, we might exceed this if there's many active RPs. */
+   {
+      /* Quicker non-unique lock, should hit this path mostly. */
+      std::shared_lock lock(rp_mutex);
+      if (rp_histories.size() <= MAX_RP_HISTORIES)
+         return;
+   }
+
+   std::unique_lock lock(rp_mutex);
+   size_t og_size = rp_histories.size();
+   if (og_size <= MAX_RP_HISTORIES)
+      return;
+
+   std::vector<rp_histories_t::iterator> candidates;
+   candidates.reserve(og_size);
+   for (auto it = rp_histories.begin(); it != rp_histories.end(); ++it) {
+      if (it->second.refcount.load(std::memory_order_relaxed) == 0)
+         candidates.push_back(it);
+   }
+
+   size_t to_purge = std::min(candidates.size(), og_size - MAX_RP_HISTORIES);
+   if (to_purge == 0) {
+      at_log_base("no RP histories to reap at size %zu, all are active", og_size);
+      return;
+   }
+
+   /* Partition candidates by last use timestamp, oldest first. */
+   auto partition_end = candidates.begin() + to_purge;
+   if (to_purge < candidates.size()) {
+      std::nth_element(candidates.begin(), partition_end, candidates.end(),
+                       [](rp_histories_t::iterator a, rp_histories_t::iterator b) {
+                          return a->second.last_use_ts.load(std::memory_order_relaxed) <
+                                 b->second.last_use_ts.load(std::memory_order_relaxed);
+                       });
+   }
+
+   for (auto it = candidates.begin(); it != partition_end; ++it) {
+      rp_history &history = (*it)->second;
+      if (history.refcount.load(std::memory_order_relaxed) == 0) {
+         at_log_base("reaping RP history %016" PRIx64, history.hash);
+         rp_histories.erase(*it);
+      }
+   }
+
+   at_log_base("reaped old RP histories %zu -> %zu", og_size, rp_histories.size());
 }
 
 void
@@ -1429,6 +1558,7 @@ tu_autotune::on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_c
     * processed all entries from prior CBs before we submit any new CBs with the same RP to the GPU.
     */
    process_entries();
+   reap_old_rp_histories();
 
    bool has_results = false;
    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
@@ -1547,11 +1677,11 @@ tu_autotune::cmd_buf_ctx::reset()
 
 tu_autotune::rp_entry &
 tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device,
-                                          rp_history &history,
+                                          rp_history_handle &&history,
                                           config_t config,
                                           uint32_t drawcall_count)
 {
-   return batch->entries.emplace_back(device, history, config, drawcall_count);
+   return batch->entries.emplace_back(device, std::move(history), config, drawcall_count);
 }
 
 tu_autotune::rp_entry *
@@ -1650,8 +1780,8 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
       key = rp_key(key, entry->duplicates);
    }
 
-   auto &history = find_or_create_rp_history(key);
-   *rp_ctx = &cb_ctx.attach_rp_entry(device, history, config, rp_state->drawcall_count);
+   *rp_ctx = &cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
+   rp_history &history = *((*rp_ctx)->history);
 
    if (config.test(mod_flag::PREEMPT_OPTIMIZE) && history.preempt_optimize.is_latency_sensitive()) {
       /* Try to mitigate the risk of high preemption latency by always using GMEM, which should break up any larger
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
index c3199672d6b..c4cefcd2fd6 100644
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -142,6 +142,7 @@ struct tu_autotune {
    /** Renderpass State Tracking **/
 
    struct rp_history;
+   struct rp_history_handle;
 
    /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
@@ -182,12 +183,15 @@ struct tu_autotune {
    };
 
    /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
-   std::unordered_map<rp_key, rp_history> rp_histories;
+   using rp_histories_t = std::unordered_map<rp_key, rp_history>;
+   rp_histories_t rp_histories;
    std::shared_mutex rp_mutex;
+   uint64_t last_reap_ts = 0;
 
    /* Note: These will internally lock rp_mutex internally, no need to lock it. */
-   rp_history *find_rp_history(const rp_key &key);
-   rp_history &find_or_create_rp_history(const rp_key &key);
+   rp_history_handle find_rp_history(const rp_key &key);
+   rp_history_handle find_or_create_rp_history(const rp_key &key);
+   void reap_old_rp_histories();
 
    /** Debug Performance Counters **/
@@ -215,7 +219,8 @@ struct tu_autotune {
      std::shared_ptr<rp_entry_batch> batch;
 
      /* Creates a new RP entry attached to this CB. */
-     rp_entry &attach_rp_entry(struct tu_device *device, rp_history &entry, config_t config, uint32_t draw_count);
+     rp_entry &
+     attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
 
      rp_entry *find_rp_entry(const rp_key &key);

From 0c4c72ca58bd4b8d3d874af36c7c995beb237841 Mon Sep 17 00:00:00 2001
From: Dhruv Mark Collins
Date: Thu, 11 Dec 2025 15:11:46 +0000
Subject: [PATCH 19/25] tu/autotune: Only lock RPs that sustain a certain mode for 30s

Many games have short periods where a certain mode might win
consistently, but this trend doesn't hold after that. Only allowing
locking to occur on RPs where a certain mode consistently stays winning
for 30s allows us to partially mitigate these bad locks.
Signed-off-by: Dhruv Mark Collins
---
 src/freedreno/vulkan/tu_autotune.cc | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index d42c3346a6c..e645ad47172 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -1101,6 +1101,9 @@ struct tu_autotune::rp_history {
    bool locked = false; /* If true, the probability will no longer be updated. */
    uint64_t seed[2] { 0x3bffb83978e24f88, 0x9238d5d56c71cd35 };
 
+   bool is_sysmem_winning = false;
+   uint64_t winning_since_ts = 0;
+
 public:
    profiled_algo(uint64_t hash)
    {
@@ -1167,6 +1170,15 @@ struct tu_autotune::rp_history {
          constexpr uint32_t MIN_LOCK_DURATION_COUNT = 15;
          constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */
          constexpr uint32_t LOCK_PERCENT_DIFF = 30;
+         constexpr uint64_t LOCK_TIME_WINDOW_NS = 30'000'000'000; /* 30s */
+
+         uint64_t now = os_time_get_nano();
+         bool current_sysmem_winning = avg_sysmem < avg_gmem;
+
+         if (winning_since_ts == 0 || current_sysmem_winning != is_sysmem_winning) {
+            winning_since_ts = now;
+            is_sysmem_winning = current_sysmem_winning;
+         }
 
          bool has_resolved = sysmem_prob == SLOW_MAX_PROBABILITY || sysmem_prob == SLOW_MIN_PROBABILITY;
          bool enough_samples =
@@ -1176,7 +1188,7 @@ struct tu_autotune::rp_history {
          uint64_t percent_diff = (100 * (max_avg - min_avg)) / min_avg;
 
          if (has_resolved && enough_samples && max_avg >= MIN_LOCK_THRESHOLD &&
-             percent_diff >= LOCK_PERCENT_DIFF) {
+             percent_diff >= LOCK_PERCENT_DIFF && (now - winning_since_ts) >= LOCK_TIME_WINDOW_NS) {
            if (avg_gmem < avg_sysmem)
              sysmem_prob = 0;
            else
@@ -1747,6 +1759,7 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
    if (!enabled || simultaneous_use || ignore_small_rp)
       return default_mode;
 
+
    /* We can return early with the decision based on the draw call count, instead of needing to hash the renderpass
    * instance and look up the history, which is far more expensive.
    *

From ac52ee7da6f0050f9c2c6b2155a1ccdc55a9a098 Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Wed, 17 Dec 2025 08:23:30 +0100
Subject: [PATCH 20/25] tu/autotune: fix handle-dereferencing crashes in find_or_create_rp_history()

Don't dereference the possibly-null handle value.

Signed-off-by: Zan Dobersek
Fixes: d8ff474b70d ("tu: Rewrite autotune in C++")
---
 src/freedreno/vulkan/tu_autotune.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index e645ad47172..a78398c0848 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -1467,9 +1467,9 @@ tu_autotune::find_rp_history(const rp_key &key)
 tu_autotune::rp_history_handle
 tu_autotune::find_or_create_rp_history(const rp_key &key)
 {
-   rp_history *existing = find_rp_history(key);
+   rp_history_handle existing = find_rp_history(key);
    if (existing)
-      return *existing;
+      return existing;
 
    /* If we reach here, we have to create a new history. */
    std::unique_lock lock(rp_mutex);

From 55bd5cc2d29105fa1a24d57814e80d0050c89cef Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Wed, 17 Dec 2025 08:43:10 +0100
Subject: [PATCH 21/25] tu/autotune: fix tu_queue_init() crashes when disabling preempt-optimize mode

Disabling the preempt-optimize mode in the autotuner from tu_queue_init()
leads to crashes, since the autotuner hasn't been created for the logical
device at that point.

To avoid that, gather the global priority information during queue
initialization in tu_CreateDevice(), but delay the preempt-optimize
disablement until after the autotuner is created.

Signed-off-by: Zan Dobersek
Fixes: e167c2ec4d3 ("tu/autotune: Add "Preempt Optimize" mode")
---
 src/freedreno/vulkan/tu_device.cc | 18 ++++++++++++++++--
 src/freedreno/vulkan/tu_queue.cc  | 12 +-----------
 src/freedreno/vulkan/tu_queue.h   |  1 +
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 820f0bda174..d593fbfc26c 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -2669,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    VkResult result;
    struct tu_device *device;
    bool border_color_without_format = false;
+   bool autotune_disable_preempt_optimize = false;
 
    vk_foreach_struct_const (ext, pCreateInfo->pNext) {
       switch (ext->sType) {
@@ -2790,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
       const VkDeviceQueueCreateInfo *queue_create =
         &pCreateInfo->pQueueCreateInfos[i];
+      const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
+         vk_find_struct_const(queue_create->pNext,
+                              DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+      const VkQueueGlobalPriorityKHR global_priority = priority_info ?
+         priority_info->globalPriority :
+         (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
+                             VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
       uint32_t qfi = queue_create->queueFamilyIndex;
       enum tu_queue_type type = physical_device->queue_families[qfi].type;
       device->queues[qfi] = (struct tu_queue *) vk_alloc(
@@ -2809,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       device->queue_count[qfi] = queue_create->queueCount;
 
       for (unsigned q = 0; q < queue_create->queueCount; q++) {
-         result = tu_queue_init(device, &device->queues[qfi][q], type, q,
-                                queue_create);
+         result = tu_queue_init(device, &device->queues[qfi][q], type,
+                                global_priority, q, queue_create);
          if (result != VK_SUCCESS) {
             device->queue_count[qfi] = q;
             goto fail_queues;
         }
      }
+
+      autotune_disable_preempt_optimize |=
+         (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
    }
 
    result = vk_meta_device_init(&device->vk, &device->meta);
@@ -3021,6 +3032,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_timeline_cond;
 
+   if (autotune_disable_preempt_optimize)
+      device->autotune->disable_preempt_optimize();
+
    device->use_z24uint_s8uint =
       physical_device->info->props.has_z24uint_s8uint &&
       (!border_color_without_format ||
diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc
index 2f963ce9ee0..7563e2c3b45 100644
--- a/src/freedreno/vulkan/tu_queue.cc
+++ b/src/freedreno/vulkan/tu_queue.cc
@@ -605,17 +605,10 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info)
 {
-   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
-      vk_find_struct_const(create_info->pNext,
-                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
-   const VkQueueGlobalPriorityKHR global_priority = priority_info ?
-      priority_info->globalPriority :
-      (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
-                          VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
-
    const int priority = tu_get_submitqueue_priority(
       device->physical_device, global_priority, type,
       device->vk.enabled_features.globalPriorityQuery);
@@ -641,9 +634,6 @@ tu_queue_init(struct tu_device *device,
 
    queue->fence = -1;
 
-   if (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR)
-      device->autotune->disable_preempt_optimize();
-
    return VK_SUCCESS;
 }
 
diff --git a/src/freedreno/vulkan/tu_queue.h b/src/freedreno/vulkan/tu_queue.h
index 28925bfcb50..278756a43af 100644
--- a/src/freedreno/vulkan/tu_queue.h
+++ b/src/freedreno/vulkan/tu_queue.h
@@ -43,6 +43,7 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info);

From 555580cb94d6ca856e39ff1bfd20d81dba61e745 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev
Date: Wed, 17 Dec 2025 09:00:11 +0100
Subject: [PATCH 22/25] tu/autotune: use PACKED to tightly pack packed_att_properties struct

Fixes: ff4bb3c658c ("tu/autotune: Improve RP hash")
---
 src/freedreno/vulkan/tu_autotune.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index a78398c0848..e61045555fe 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -865,12 +865,12 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
    * Note: Not using image IOVA leads to too many false matches.
    */
 
-   struct packed_att_properties {
+   struct PACKED packed_att_properties {
      uint64_t iova;
-     bool load : 1;
-     bool store : 1;
-     bool load_stencil : 1;
-     bool store_stencil : 1;
+     bool load;
+     bool store;
+     bool load_stencil;
+     bool store_stencil;
    };
 
    auto get_hash = [&](uint32_t *data, size_t size) {

From 1cbc00615a133bb3d23b3600fb3ce6d50adb47c0 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev
Date: Wed, 17 Dec 2025 09:22:56 +0100
Subject: [PATCH 23/25] tu/autotune: manage rp_entry objects through std::unique_ptr

The entries vector can grow and move, so taking references to rp_entry
objects in the vector was susceptible to UAF. Instead, the entries
vector now manages rp_entry objects through std::unique_ptr.

Move constructor and assignment operators for rp_entry are deleted since
they are not currently needed.

Fixes: d8ff474b70d ("tu: Rewrite autotune in C++")
---
 src/freedreno/vulkan/tu_autotune.cc | 44 ++++++++---------------------
 src/freedreno/vulkan/tu_autotune.h  |  4 +--
 2 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index e61045555fe..a0566aad7b7 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -529,33 +529,11 @@ struct tu_autotune::rp_entry {
      }
    }
 
-   /* Disable the copy operators as that shouldn't be done. */
+   /* Disable the copy/move operators as that shouldn't be done. */
    rp_entry(const rp_entry &) = delete;
    rp_entry &operator=(const rp_entry &) = delete;
-
-   rp_entry(rp_entry &&other) noexcept
-      : device(other.device), bo(other.bo), map(other.map), history(std::move(other.history)), config(other.config),
-       sysmem(other.sysmem), tile_count(other.tile_count), draw_count(other.draw_count)
-   {
-      other.map = nullptr; /* Prevent the destructor from freeing the BO. */
-   }
-
-   rp_entry &operator=(rp_entry &&other) noexcept
-   {
-      if (this != &other) {
-         device = other.device;
-         bo = other.bo;
-         map = other.map;
-         history = std::move(other.history);
-         config = other.config;
-         sysmem = other.sysmem;
-         tile_count = other.tile_count;
-         draw_count = other.draw_count;
-
-         other.map = nullptr;
-      }
-      return *this;
-   }
+   rp_entry(rp_entry &&) = delete;
+   rp_entry &operator=(rp_entry &&) = delete;
 
    void allocate(bool sysmem, uint32_t tile_count)
    {
@@ -1549,8 +1527,8 @@ tu_autotune::process_entries()
          break; /* Entries are allocated in sequence, next will be newer and also fail so we can just directly break out of the loop. */
 
-      for (rp_entry &entry : batch->entries)
-         entry.history->process(entry, *this);
+      for (auto &entry : batch->entries)
+         entry->history->process(*entry, *this);
 
       active_batches.pop_front();
    }
@@ -1687,21 +1665,23 @@ tu_autotune::cmd_buf_ctx::reset()
    batch = std::make_shared<rp_entry_batch>();
 }
 
-tu_autotune::rp_entry &
+tu_autotune::rp_entry *
 tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device,
                                           rp_history_handle &&history,
                                           config_t config,
                                           uint32_t drawcall_count)
 {
-   return batch->entries.emplace_back(device, std::move(history), config, drawcall_count);
+   std::unique_ptr<rp_entry> &new_entry =
+      batch->entries.emplace_back(std::make_unique<rp_entry>(device, std::move(history), config, drawcall_count));
+   return new_entry.get();
 }
 
 tu_autotune::rp_entry *
 tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key)
 {
    for (auto &entry : batch->entries) {
-      if (entry.history->hash == key.hash)
-         return &entry;
+      if (entry->history->hash == key.hash)
+         return entry.get();
    }
    return nullptr;
 }
@@ -1793,7 +1773,7 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
       key = rp_key(key, entry->duplicates);
    }
 
-   *rp_ctx = &cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
+   *rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
    rp_history &history = *((*rp_ctx)->history);
 
    if (config.test(mod_flag::PREEMPT_OPTIMIZE) && history.preempt_optimize.is_latency_sensitive()) {
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
index c4cefcd2fd6..b9bcf6ee0da 100644
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -114,7 +114,7 @@ struct tu_autotune {
                         valid fence. */
      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to determine when the entries can be processed. */
-     std::vector<rp_entry> entries;
+     std::vector<std::unique_ptr<rp_entry>> entries;
 
      rp_entry_batch();
@@ -219,7 +219,7 @@ struct tu_autotune {
      std::shared_ptr<rp_entry_batch> batch;
 
      /* Creates a new RP entry attached to this CB. */
-     rp_entry &
+     rp_entry *
      attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
 
      rp_entry *find_rp_entry(const rp_key &key);

From 9cf331d78b06a13be0e1e14cc783c2f95cc3b4df Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Wed, 17 Dec 2025 09:51:15 +0100
Subject: [PATCH 24/25] tu/autotune: make autotune algo determination less chatty

Only print out the autotuner algorithm decision if the specified
algorithm (either through the environment or driconf) was meaningful,
i.e. not an empty string.
Signed-off-by: Zan Dobersek
---
 src/freedreno/vulkan/tu_autotune.cc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index a0566aad7b7..6ddbc8fcfa7 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -276,14 +276,16 @@ tu_autotune::get_env_config()
    static std::once_flag once;
    static config_t at_config;
    std::call_once(once, [&] {
-      const char *algo_str = os_get_option("TU_AUTOTUNE_ALGO");
       algorithm algo = algorithm::DEFAULT;
+      const char *algo_str = os_get_option("TU_AUTOTUNE_ALGO");
+      std::string_view algo_strv;
 
-      if (!algo_str)
-         algo_str = device->instance->autotune_algo; /* From dri conf. */
+      if (algo_str)
+         algo_strv = algo_str;
+      else if (device->instance->autotune_algo)
+         algo_strv = device->instance->autotune_algo;
 
-      if (algo_str) {
-         std::string_view algo_strv(algo_str);
+      if (!algo_strv.empty()) {
         if (algo_strv == "bandwidth") {
           algo = algorithm::BANDWIDTH;
         } else if (algo_strv == "profiled") {
@@ -293,11 +295,11 @@ tu_autotune::get_env_config()
         } else if (algo_strv == "prefer_sysmem") {
           algo = algorithm::PREFER_SYSMEM;
         } else {
-          mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_str);
+          mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_strv.data());
         }
 
         if (TU_DEBUG(STARTUP))
-          mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_str);
+          mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_strv.data());
      }
 
      /* Parse the flags from the environment variable. */

From 97f6f8b1a094fae02beb1c9c5bb945fd2335c8b6 Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Wed, 17 Dec 2025 18:09:48 +0100
Subject: [PATCH 25/25] tu/autotune: tweak small-renderpass draw call count limit

Tweak the draw call limit value so that renderpasses with less than 5
draw calls fall under the small-renderpass classification.

Signed-off-by: Zan Dobersek
---
 src/freedreno/vulkan/tu_autotune.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index 6ddbc8fcfa7..0b3dbc5b4f7 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -1736,7 +1736,7 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
    * a few edge cases such as during deferred rendering G-buffer passes, as we don't have a good way to deal with those
    * edge cases yet, we just disable the autotuner for small RPs entirely for now unless TUNE_SMALL is specified.
    */
-   bool ignore_small_rp = !config.test(mod_flag::TUNE_SMALL) && rp_state->drawcall_count <= 5;
+   bool ignore_small_rp = !config.test(mod_flag::TUNE_SMALL) && rp_state->drawcall_count < 5;
 
    if (!enabled || simultaneous_use || ignore_small_rp)
       return default_mode;
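
The profiled-mode changes in patches 14-19 are easiest to read as one combined
update rule: nudge a per-renderpass SYSMEM probability toward whichever mode
currently shows the lower average duration (5% steps between 5% and 95%, 1%
steps out to 1%/99%), and only lock the decision permanently for long, clearly
one-sided renderpasses whose winner has been stable for 30 seconds. The sketch
below restates that rule as standalone C++ for illustration only; it is not the
driver code, and the names (mode_stats, profiled_state, update_probability),
the 10-sample warm-up threshold, and the ticks_per_ms / winner_stable_for_30s
parameters are invented stand-ins for the corresponding tu_autotune internals.

/* Minimal sketch of the profiled-mode probability update and lock heuristic.
 * All names and the warm-up threshold are placeholders, not Turnip symbols. */
#include <algorithm>
#include <cstdint>

struct mode_stats {
   uint64_t avg_ticks; /* EMA of renderpass duration in this mode. */
   uint64_t samples;   /* Number of timing results folded into the EMA. */
};

struct profiled_state {
   uint32_t sysmem_prob = 50; /* Probability (%) of choosing SYSMEM. */
   bool locked = false;
};

static void
update_probability(profiled_state &st, mode_stats sysmem, mode_stats gmem,
                   uint64_t ticks_per_ms, bool winner_stable_for_30s)
{
   /* 10 is a stand-in for the real minimum-sample threshold. */
   if (st.locked || sysmem.samples < 10 || gmem.samples < 10)
      return;

   /* Step toward the faster mode: 5% steps between 5% and 95%, 1% steps out to 1%/99%. */
   auto step = [&st](bool toward_sysmem) {
      uint32_t delta = (st.sysmem_prob > 5 && st.sysmem_prob < 95) ? 5 : 1;
      if (toward_sysmem)
         st.sysmem_prob = std::min<uint32_t>(st.sysmem_prob + delta, 99);
      else
         st.sysmem_prob = std::max<uint32_t>(st.sysmem_prob - delta, 1);
   };

   if (gmem.avg_ticks < sysmem.avg_ticks)
      step(false);
   else if (sysmem.avg_ticks < gmem.avg_ticks)
      step(true);

   /* Lock only when the probability has already reached an extreme, the renderpass is long
    * enough to matter (>= 1 ms), the modes differ by >= 30% and the winner has been stable. */
   uint64_t lo = std::min(sysmem.avg_ticks, gmem.avg_ticks);
   uint64_t hi = std::max(sysmem.avg_ticks, gmem.avg_ticks);
   bool resolved = st.sysmem_prob == 1 || st.sysmem_prob == 99;
   if (resolved && winner_stable_for_30s && hi >= ticks_per_ms && lo > 0 &&
       (100 * (hi - lo)) / lo >= 30) {
      st.sysmem_prob = (sysmem.avg_ticks < gmem.avg_ticks) ? 100 : 0;
      st.locked = true;
   }
}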