diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index f57db5ff18d..3bfbb348bbb 100644
--- a/docs/drivers/freedreno.rst
+++ b/docs/drivers/freedreno.rst
@@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
 Some of these options will behave differently when toggled at runtime, for
 example: ``nolrz`` will still result in LRZ allocation which would not happen
 if the option was set in the environment variable.
+
+Autotune
+^^^^^^^^
+
+Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
+autotune system, the behavior of which can be controlled with the following
+environment variables:
+
+.. envvar:: TU_AUTOTUNE_ALGO
+
+  Selects the algorithm used for autotuning. Supported values are:
+
+  ``bandwidth``
+    Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
+    the one with the lower estimated bandwidth.
+
+  ``profiled``
+    Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to
+    move a probability distribution towards the optimal choice over time. This
+    algorithm tends to be far more accurate than the bandwidth algorithm at choosing
+    the optimal rendering mode, but may result in larger FPS variance due to being
+    based on a probability distribution with random sampling. This is the default
+    algorithm.
+
+  ``profiled_imm``
+    Similar to ``profiled``, but only profiles the first few instances of an RP
+    and then sticks to the chosen mode for subsequent instances. This is meant
+    for single-frame traces run multiple times in CI, where this algorithm can
+    immediately choose the optimal rendering mode for each RP.
+
+  ``prefer_sysmem``
+    Always chooses SYSMEM rendering. This is useful for games that don't benefit
+    from GMEM rendering due to their rendering patterns; when set for performance
+    reasons, this is preferable to using ``TU_DEBUG=sysmem``.
+
+  The algorithm can also be set via the driconf option ``tu_autotune_algorithm``.
+
+.. envvar:: TU_AUTOTUNE_FLAGS
+
+  Modifies the behavior of the selected algorithm. Supported flags are:
+
+  ``big_gmem``
+    Always chooses GMEM rendering if the number of draw calls in the render pass
+    is greater than a certain threshold. Larger RPs generally benefit more from
+    GMEM rendering due to less overhead from tiling.
+
+  ``small_sysmem``
+    Always chooses SYSMEM rendering if the number of draw calls in the render pass
+    is lower than a certain threshold. The benefits of GMEM rendering are less
+    pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
+
+  ``preempt_optimize``
+    Tries to keep the non-preemptible time in the render pass below a certain
+    threshold. This is useful for systems with GPU-based compositors, where long
+    non-preemptible times can lead to missed frame deadlines and noticeable
+    stuttering. This flag will reduce the performance of the render pass in order
+    to improve overall system responsiveness; it should not be used unless the
+    rest of the system is affected by preemption delays.
+
+  Multiple flags can be combined by separating them with commas, e.g.
+  ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
+
+  If no flags are specified, the default behavior is used.
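+
+As an illustrative example (the application name below is only a placeholder,
+not part of the driver), an algorithm and a set of flags documented above can
+be combined when launching an application:
+
+.. code-block:: sh
+
+   TU_AUTOTUNE_ALGO=profiled TU_AUTOTUNE_FLAGS=big_gmem,preempt_optimize ./my_vulkan_app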
\ No newline at end of file diff --git a/src/freedreno/vulkan/.clang-format b/src/freedreno/vulkan/.clang-format index f7f9e5755db..256e3ff892f 100644 --- a/src/freedreno/vulkan/.clang-format +++ b/src/freedreno/vulkan/.clang-format @@ -4,7 +4,7 @@ DisableFormat: false AlwaysBreakAfterReturnType: TopLevel BinPackParameters: false -ColumnLimit: 78 +ColumnLimit: 120 Cpp11BracedListStyle: false IncludeBlocks: Regroup diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index df969834a37..0b3dbc5b4f7 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -5,113 +5,358 @@ #include "tu_autotune.h" +#include +#include +#include +#include +#include +#include + +#include "util/rand_xor.h" + +#define XXH_INLINE_ALL +#include "util/xxhash.h" + #include "tu_cmd_buffer.h" #include "tu_cs.h" #include "tu_device.h" #include "tu_image.h" #include "tu_pass.h" -#define XXH_INLINE_ALL -#include "util/xxhash.h" +/** Compile-time debug options **/ -/* How does it work? - * - * - For each renderpass we calculate the number of samples passed - * by storing the number before and after in GPU memory. - * - To store the values each command buffer holds GPU memory which - * expands with more renderpasses being written. - * - For each renderpass we create tu_renderpass_result entry which - * points to the results in GPU memory. - * - Later on tu_renderpass_result would be added to the - * tu_renderpass_history entry which aggregate results for a - * given renderpass. - * - On submission: - * - Process results which fence was signalled. - * - Free per-submission data which we now don't need. - * - * - Create a command stream to write a fence value. This way we would - * know when we could safely read the results. - * - We cannot rely on the command buffer's lifetime when referencing - * its resources since the buffer could be destroyed before we process - * the results. - * - For each command buffer: - * - Reference its GPU memory. - * - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue. - * - * Since the command buffers could be recorded on different threads - * we have to maintaining some amount of locking history table, - * however we change the table only in a single thread at the submission - * time, so in most cases there will be no locking. - */ +#define TU_AUTOTUNE_DEBUG_LOG_BASE 0 +#define TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH 0 +#define TU_AUTOTUNE_DEBUG_LOG_PROFILED 0 +#define TU_AUTOTUNE_DEBUG_LOG_PREEMPT 0 -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results); +#if TU_AUTOTUNE_DEBUG_LOG_BASE +#define at_log_base(fmt, ...) mesa_logi("autotune: " fmt, ##__VA_ARGS__) +#define at_log_base_h(fmt, hash, ...) mesa_logi("autotune %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_base(fmt, ...) +#define at_log_base_h(fmt, hash, ...) +#endif -#define TU_AUTOTUNE_DEBUG_LOG 0 -/* Dump history entries on autotuner finish, - * could be used to gather data from traces. - */ -#define TU_AUTOTUNE_LOG_AT_FINISH 0 +#if TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH +#define at_log_bandwidth_h(fmt, hash, ...) mesa_logi("autotune-bw %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_bandwidth_h(fmt, hash, ...) +#endif -/* How many last renderpass stats are taken into account. */ -#define MAX_HISTORY_RESULTS 5 -/* For how many submissions we store renderpass stats. */ -#define MAX_HISTORY_LIFETIME 128 +#if TU_AUTOTUNE_DEBUG_LOG_PROFILED +#define at_log_profiled_h(fmt, hash, ...) 
mesa_logi("autotune-prof %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_profiled_h(fmt, hash, ...) +#endif +#if TU_AUTOTUNE_DEBUG_LOG_PREEMPT +#define at_log_preempt_h(fmt, hash, ...) mesa_logi("autotune-preempt %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_preempt_h(fmt, hash, ...) +#endif -/** - * Tracks results for a given renderpass key - */ -struct tu_renderpass_history { - uint64_t key; +#if TU_AUTOTUNE_DEBUG_PERFCTR +#define at_log_perfctr_h(fmt, hash, ...) mesa_logi("autotune-perfctr %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_perfctr_h(fmt, hash, ...) +#endif - /* We would delete old history entries */ - uint32_t last_fence; +/* Process any pending entries on autotuner finish, could be used to gather data from traces. */ +#define TU_AUTOTUNE_FLUSH_AT_FINISH 0 - /** - * List of recent fd_renderpass_result's - */ - struct list_head results; - uint32_t num_results; +/** Global constants and helpers **/ - uint32_t avg_samples; -}; +/* GPU always-on timer constants */ +constexpr uint64_t ALWAYS_ON_FREQUENCY_HZ = 19'200'000; +constexpr uint64_t GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1'000'000; -/* Holds per-submission cs which writes the fence. */ -struct tu_submission_data { - struct list_head node; - uint32_t fence; - - struct tu_cs fence_cs; -}; - -static bool -fence_before(uint32_t a, uint32_t b) +constexpr uint64_t +ticks_to_us(uint64_t ticks) { - /* essentially a < b, but handle wrapped values */ - return (int32_t)(a - b) < 0; + return ticks / GPU_TICKS_PER_US; } -static uint32_t -get_autotune_fence(struct tu_autotune *at) +constexpr bool +fence_before(uint32_t a, uint32_t b) { - return at->device->global_bo_map->autotune_fence; + /* Essentially a < b, but handles wrapped values. */ + return (int32_t) (a - b) < 0; +} + +constexpr const char * +render_mode_str(tu_autotune::render_mode mode) +{ + switch (mode) { + case tu_autotune::render_mode::SYSMEM: + return "SYSMEM"; + case tu_autotune::render_mode::GMEM: + return "GMEM"; + default: + return "UNKNOWN"; + } +} + +/** Configuration **/ + +enum class tu_autotune::algorithm : uint8_t { + BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ + PROFILED = 1, /* Uses dynamically profiled results for determining rendering mode. */ + PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ + PREFER_SYSMEM = 3, /* Always use SYSMEM unless we have strong evidence that GMEM is better. */ + + DEFAULT = PROFILED, /* Default algorithm, used if no other is specified. */ +}; + +/* Modifier flags, these modify the behavior of the autotuner in a user-defined way. */ +enum class tu_autotune::mod_flag : uint8_t { + BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ + TUNE_SMALL = BIT(2), /* Try tuning all RPs with <= 5 draws, ignored by default. */ + PREEMPT_OPTIMIZE = BIT(3), /* Attempts to minimize the preemption latency. */ +}; + +/* Metric flags, for internal tracking of enabled metrics. */ +enum class tu_autotune::metric_flag : uint8_t { + SAMPLES = BIT(1), /* Enable tracking samples passed metric. */ + TS = BIT(2), /* Enable tracking per-RP timestamp metric. */ + TS_TILE = BIT(3), /* Enable tracking per-tile timestamp metric. */ +}; + +struct PACKED tu_autotune::config_t { + private: + algorithm algo = algorithm::DEFAULT; + uint8_t mod_flags = 0; /* See mod_flag enum. */ + uint8_t metric_flags = 0; /* See metric_flag enum. 
*/ + + constexpr void update_metric_flags() + { + /* Note: Always keep in sync with rp_history to prevent UB. */ + if (algo == algorithm::BANDWIDTH) { + metric_flags |= (uint8_t) metric_flag::SAMPLES; + } else if (algo == algorithm::PROFILED || algo == algorithm::PROFILED_IMM) { + metric_flags |= (uint8_t) metric_flag::TS; + } + + if (mod_flags & (uint8_t) mod_flag::PREEMPT_OPTIMIZE) { + metric_flags |= (uint8_t) metric_flag::TS | (uint8_t) metric_flag::TS_TILE; + } + } + + public: + constexpr config_t() = default; + + constexpr config_t(algorithm algo, uint8_t mod_flags): algo(algo), mod_flags(mod_flags) + { + update_metric_flags(); + } + + constexpr bool is_enabled(algorithm a) const + { + return algo == a; + } + + constexpr bool test(mod_flag f) const + { + return mod_flags & (uint32_t) f; + } + + constexpr bool test(metric_flag f) const + { + return metric_flags & (uint32_t) f; + } + + constexpr bool set_algo(algorithm a) + { + if (algo == a) + return false; + + algo = a; + update_metric_flags(); + return true; + } + + constexpr bool disable(mod_flag f) + { + if (!(mod_flags & (uint8_t) f)) + return false; + + mod_flags &= ~(uint8_t) f; + update_metric_flags(); + return true; + } + + constexpr bool enable(mod_flag f) + { + if (mod_flags & (uint8_t) f) + return false; + + mod_flags |= (uint8_t) f; + update_metric_flags(); + return true; + } + + std::string to_string() const + { +#define ALGO_STR(algo_name) \ + if (algo == algorithm::algo_name) \ + str += #algo_name; +#define MODF_STR(flag) \ + if (mod_flags & (uint8_t) mod_flag::flag) { \ + str += #flag " "; \ + } +#define METRICF_STR(flag) \ + if (metric_flags & (uint8_t) metric_flag::flag) { \ + str += #flag " "; \ + } + + std::string str = "Algorithm: "; + + ALGO_STR(BANDWIDTH); + ALGO_STR(PROFILED); + ALGO_STR(PROFILED_IMM); + ALGO_STR(PREFER_SYSMEM); + + str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; + MODF_STR(BIG_GMEM); + MODF_STR(TUNE_SMALL); + MODF_STR(PREEMPT_OPTIMIZE); + str += ")"; + + str += ", Metric Flags: 0x" + std::to_string(metric_flags) + " ("; + METRICF_STR(SAMPLES); + METRICF_STR(TS); + METRICF_STR(TS_TILE); + str += ")"; + + return str; + +#undef ALGO_STR +#undef MODF_STR +#undef METRICF_STR + } +}; + +union tu_autotune::packed_config_t { + config_t config; + uint32_t bits = 0; + static_assert(sizeof(bits) >= sizeof(config)); + static_assert(std::is_trivially_copyable::value, + "config_t must be trivially copyable to be automatically packed"); + + constexpr packed_config_t(config_t config): config(config) + { + } + + constexpr packed_config_t(uint32_t bits): bits(bits) + { + } +}; + +tu_autotune::atomic_config_t::atomic_config_t(config_t initial): config_bits(packed_config_t { initial }.bits) +{ +} + +tu_autotune::config_t +tu_autotune::atomic_config_t::load() const +{ + return config_t(packed_config_t { config_bits.load(std::memory_order_relaxed) }.config); +} + +bool +tu_autotune::atomic_config_t::compare_and_store(config_t updated, config_t expected) +{ + uint32_t expected_bits = packed_config_t { expected }.bits; + return config_bits.compare_exchange_strong(expected_bits, packed_config_t { updated }.bits, + std::memory_order_acquire, std::memory_order_relaxed); +} + +tu_autotune::config_t +tu_autotune::get_env_config() +{ + static std::once_flag once; + static config_t at_config; + std::call_once(once, [&] { + algorithm algo = algorithm::DEFAULT; + const char *algo_str = os_get_option("TU_AUTOTUNE_ALGO"); + std::string_view algo_strv; + + if (algo_str) + algo_strv = algo_str; + else if 
(device->instance->autotune_algo) + algo_strv = device->instance->autotune_algo; + + if (!algo_strv.empty()) { + if (algo_strv == "bandwidth") { + algo = algorithm::BANDWIDTH; + } else if (algo_strv == "profiled") { + algo = algorithm::PROFILED; + } else if (algo_strv == "profiled_imm") { + algo = algorithm::PROFILED_IMM; + } else if (algo_strv == "prefer_sysmem") { + algo = algorithm::PREFER_SYSMEM; + } else { + mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_strv.data()); + } + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_strv.data()); + } + + /* Parse the flags from the environment variable. */ + const char *flags_env_str = os_get_option("TU_AUTOTUNE_FLAGS"); + uint32_t mod_flags = 0; + if (flags_env_str) { + static const struct debug_control tu_at_flags_control[] = { + { "big_gmem", (uint32_t) mod_flag::BIG_GMEM }, + { "tune_small", (uint32_t) mod_flag::TUNE_SMALL }, + { "preempt_optimize", (uint32_t) mod_flag::PREEMPT_OPTIMIZE }, + { NULL, 0 } + }; + + mod_flags = parse_debug_string(flags_env_str, tu_at_flags_control); + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_FLAGS=0x%x (%s)", mod_flags, flags_env_str); + } + + assert((uint8_t) mod_flags == mod_flags); + at_config = config_t(algo, (uint8_t) mod_flags); + }); + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE: %s", at_config.to_string().c_str()); + + return at_config; +} + +/** Global Fence and Internal CS Management **/ + +tu_autotune::submission_entry::submission_entry(tu_device *device): fence(0) +{ + tu_cs_init(&fence_cs, device, TU_CS_MODE_GROW, 5, "autotune fence cs"); +} + +tu_autotune::submission_entry::~submission_entry() +{ + assert(!is_active()); + tu_cs_finish(&fence_cs); +} + +bool +tu_autotune::submission_entry::is_active() const +{ + return fence_cs.device->global_bo_map->autotune_fence < fence; } template static void -create_submission_fence(struct tu_device *dev, - struct tu_cs *cs, - uint32_t fence) +write_fence_cs(struct tu_device *dev, struct tu_cs *cs, uint32_t fence) { uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence); if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); - tu_cs_emit(cs, - CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, - .write_src = EV_WRITE_USER_32B, - .write_dst = EV_DST_RAM, - .write_enabled = true).value); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, .write_src = EV_WRITE_USER_32B, .write_dst = EV_DST_RAM, + .write_enabled = true) + .value); } else { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); @@ -121,626 +366,1518 @@ create_submission_fence(struct tu_device *dev, tu_cs_emit(cs, fence); } -static struct tu_submission_data * -create_submission_data(struct tu_device *dev, struct tu_autotune *at, - uint32_t fence) +struct tu_cs * +tu_autotune::submission_entry::try_get_cs(uint32_t new_fence) { - struct tu_submission_data *submission_data = NULL; - if (!list_is_empty(&at->submission_data_pool)) { - submission_data = list_first_entry(&at->submission_data_pool, - struct tu_submission_data, node); - list_del(&submission_data->node); - } else { - submission_data = (struct tu_submission_data *) calloc( - 1, sizeof(struct tu_submission_data)); - tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs"); - } - submission_data->fence = fence; - - struct tu_cs* fence_cs = &submission_data->fence_cs; - tu_cs_begin(fence_cs); - TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence); - tu_cs_end(fence_cs); - - 
list_addtail(&submission_data->node, &at->pending_submission_data); - - return submission_data; -} - -static void -finish_submission_data(struct tu_autotune *at, - struct tu_submission_data *data) -{ - list_del(&data->node); - list_addtail(&data->node, &at->submission_data_pool); - tu_cs_reset(&data->fence_cs); -} - -static void -free_submission_data(struct tu_submission_data *data) -{ - list_del(&data->node); - tu_cs_finish(&data->fence_cs); - - free(data); -} - -static uint64_t -hash_renderpass_instance(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd) { - uint32_t data[3 + pass->attachment_count * 5]; - uint32_t* ptr = data; - - *ptr++ = framebuffer->width; - *ptr++ = framebuffer->height; - *ptr++ = framebuffer->layers; - - for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + if (is_active()) { + /* If the CS is already active, we cannot write to it. */ + return nullptr; } - return XXH64(data, sizeof(data), pass->autotune_hash); + struct tu_device *device = fence_cs.device; + tu_cs_reset(&fence_cs); + tu_cs_begin(&fence_cs); + TU_CALLX(device, write_fence_cs)(device, &fence_cs, new_fence); + tu_cs_end(&fence_cs); + assert(fence_cs.entry_count == 1); /* We expect the initial allocation to be large enough. */ + fence = new_fence; + + return &fence_cs; } -static void -free_result(struct tu_device *dev, struct tu_renderpass_result *result) +struct tu_cs * +tu_autotune::get_cs_for_fence(uint32_t fence) { - tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo); - list_del(&result->node); - free(result); + for (submission_entry &entry : submission_entries) { + struct tu_cs *cs = entry.try_get_cs(fence); + if (cs) + return cs; + } + + /* If we reach here, we have to allocate a new entry. */ + submission_entry &entry = submission_entries.emplace_back(device); + struct tu_cs *cs = entry.try_get_cs(fence); + assert(cs); /* We just allocated it, so it should be available. */ + return cs; } -static void -free_history(struct tu_device *dev, struct tu_renderpass_history *history) -{ - tu_autotune_free_results_locked(dev, &history->results); - free(history); -} +/** RP Entry Management **/ -static bool -get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples) -{ - bool has_history = false; +#if TU_AUTOTUNE_DEBUG_PERFCTR +struct PACKED tu_perf_ctr_sample { + uint64_t begin; + uint64_t end; + /* The selector value at the beginning/end, used to validate that the countable wasn't changed during a preemption. */ + uint32_t selector_begin; + uint32_t selector_end; +}; +#endif - /* If the lock contantion would be found in the wild - - * we could use try_lock here. +/* The part of the per-RP entry which is written by the GPU. */ +struct PACKED tu_autotune::rp_gpu_data { + /* HW requires the sample start/stop locations to be 128b aligned. 
*/ + alignas(16) uint64_t samples_start; + alignas(16) uint64_t samples_end; + uint64_t ts_start; + uint64_t ts_end; + +#if TU_AUTOTUNE_DEBUG_PERFCTR + struct tu_perf_ctr_sample preemption_reaction_delay, num_preemptions, always_count; + uint64_t cntrs_ready; + constexpr static uint64_t CNTRS_READY_MAGIC = 0xABCDEFEFE; +#endif +}; + +/* Per-tile values for GMEM rendering, this structure is appended to the end of rp_gpu_data for each tile. */ +struct PACKED tu_autotune::tile_gpu_data { + uint64_t ts_start; + uint64_t ts_end; + + /* A helper for the offset of this relative to BO start. */ + static constexpr uint64_t offset(uint32_t tile_index) + { + return sizeof(rp_gpu_data) + (tile_index * sizeof(tile_gpu_data)); + } +}; + +/* A small wrapper around rp_history to provide ref-counting and usage timestamps. */ +struct tu_autotune::rp_history_handle { + rp_history *history; + + /* Note: Must be called with rp_mutex held. */ + rp_history_handle(rp_history &history); + + constexpr rp_history_handle(std::nullptr_t): history(nullptr) + { + } + + rp_history_handle(const rp_history_handle &) = delete; + rp_history_handle &operator=(const rp_history_handle &) = delete; + + constexpr rp_history_handle(rp_history_handle &&other): history(other.history) + { + other.history = nullptr; + } + + constexpr rp_history_handle &operator=(rp_history_handle &&other) + { + if (this != &other) { + history = other.history; + other.history = nullptr; + } + return *this; + } + + constexpr operator bool() const + { + return history != nullptr; + } + + constexpr rp_history &operator*() const + { + assert(history); + return *history; + } + + constexpr operator rp_history *() const + { + assert(history); + return history; + } + + constexpr rp_history *operator->() const + { + assert(history); + return history; + } + + ~rp_history_handle(); +}; + +/* An "entry" of renderpass autotune results, which is used to store the results of a renderpass autotune run for a + * given command buffer. */ +struct tu_autotune::rp_entry { + private: + struct tu_device *device; + + struct tu_suballoc_bo bo; + uint8_t *map; /* A direct pointer to the BO's CPU mapping. */ + + static_assert(alignof(rp_gpu_data) == 16); + static_assert(offsetof(rp_gpu_data, samples_start) == 0); + static_assert(offsetof(rp_gpu_data, samples_end) == 16); + static_assert(sizeof(rp_gpu_data) % alignof(tile_gpu_data) == 0); + + public: + rp_history_handle history; + config_t config; /* Configuration at the time of entry creation. */ + bool sysmem; + uint32_t tile_count; + uint32_t draw_count; + + /* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ + uint32_t duplicates = 0; + + rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count) + : device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count) + { + } + + ~rp_entry() + { + if (map) { + std::scoped_lock lock(device->autotune->suballoc_mutex); + tu_suballoc_bo_free(&device->autotune->suballoc, &bo); + } + } + + /* Disable the copy/move operators as that shouldn't be done. 
*/ + rp_entry(const rp_entry &) = delete; + rp_entry &operator=(const rp_entry &) = delete; + rp_entry(rp_entry &&) = delete; + rp_entry &operator=(rp_entry &&) = delete; + + void allocate(bool sysmem, uint32_t tile_count) + { + this->sysmem = sysmem; + this->tile_count = tile_count; + size_t total_size = sizeof(rp_gpu_data) + (tile_count * sizeof(tile_gpu_data)); + + std::scoped_lock lock(device->autotune->suballoc_mutex); + VkResult result = tu_suballoc_bo_alloc(&bo, &device->autotune->suballoc, total_size, alignof(rp_gpu_data)); + if (result != VK_SUCCESS) { + mesa_loge("Failed to allocate BO for autotune rp_entry: %u", result); + return; + } + + map = (uint8_t *) tu_suballoc_bo_map(&bo); + memset(map, 0, total_size); + } + + rp_gpu_data &get_gpu_data() + { + assert(map); + return *(rp_gpu_data *) map; + } + + tile_gpu_data &get_tile_gpu_data(uint32_t tile_index) + { + assert(map); + assert(tile_index < tile_count); + uint64_t offset = tile_gpu_data::offset(tile_index); + return *(tile_gpu_data *) (map + offset); + } + + /** Samples-Passed Metric **/ + + uint64_t get_samples_passed() + { + assert(config.test(metric_flag::SAMPLES)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.samples_end - gpu.samples_start; + } + + void emit_metric_samples_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, start_iova); + + /* If the renderpass contains an occlusion query with its own ZPASS_DONE, we have to provide a fake ZPASS_DONE + * event here to logically close the previous one, preventing firmware from misbehaving due to nested events. + * This writes into the samples_end field, which will be overwritten in tu_autotune_end_renderpass. + */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = start_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + void emit_metric_samples_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova, uint64_t end_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE event here, composing a pair of these + * events that firmware handles without issue. This first event writes into the samples_end field and the + * second event overwrites it. The second event also enables the accumulation flag even when we don't use that + * result because the blob always sets it. 
+ */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, end_iova); + } + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = end_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + /** RP/Tile Timestamp Metric **/ + + uint64_t get_rp_duration() + { + assert(config.test(metric_flag::TS)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.ts_end - gpu.ts_start; + } + + /* The amount of cycles spent in the longest tile. This is used to calculate the average draw duration for + * determining the largest non-preemptible duration for GMEM rendering. */ - u_rwlock_rdlock(&at->ht_lock); - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &rp_key); - if (entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (history->num_results > 0) { - *avg_samples = p_atomic_read(&history->avg_samples); - has_history = true; + uint64_t get_max_tile_duration() + { + assert(config.test(metric_flag::TS_TILE)); + uint64_t max_duration = 0; + for (uint32_t i = 0; i < tile_count; i++) { + tile_gpu_data &tile = get_tile_gpu_data(i); + max_duration = MAX2(max_duration, tile.ts_end - tile.ts_start); } + return max_duration; } - u_rwlock_rdunlock(&at->ht_lock); - return has_history; + void emit_metric_timestamp(struct tu_cs *cs, uint64_t timestamp_iova) + { + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) | CP_REG_TO_MEM_0_CNT(2) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, timestamp_iova); + } + + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint64_t get_preemption_reaction_delay(tu_autotune &at, uint64_t rp_hash) + { + rp_gpu_data &gpu = get_gpu_data(); + + while (p_atomic_read(&gpu.cntrs_ready) != rp_gpu_data::CNTRS_READY_MAGIC) { + /* Just spin until the counter values are written out. */ + } + + auto read_counter = [&](const struct tu_perf_ctr_sample &sample, const struct fd_perfcntr_countable *ctbl, + uint64_t &outValue, const char *name) { + if (sample.selector_begin != sample.selector_end || sample.selector_begin != ctbl->selector) { + mesa_loge( + "autotune %016" PRIx64 ": %s: selector mismatch %" PRIu32 " != %" PRIu32 " (%" PRIu32 " - %" PRIu32 ")", + rp_hash, ctbl->name, sample.selector_begin, sample.selector_end, sample.selector_begin, ctbl->selector); + } + + outValue = sample.end - sample.begin; + if (sample.end < sample.begin) { + mesa_loge("autotune %016" PRIx64 ": %s: end < begin %" PRIu64 " < %" PRIu64, rp_hash, name, sample.end, + sample.begin); + outValue = 0; + } + }; + + /* We read all counters for logging, even though we only need to return the preemption reaction delay. 
*/ + uint64_t preemption_reaction_delay; + uint64_t num_preemptions; + uint64_t always_count; + read_counter(gpu.preemption_reaction_delay, at.preemption_reaction_delay, preemption_reaction_delay, + "preemption_reaction_delay"); + read_counter(gpu.num_preemptions, at.num_preemptions, num_preemptions, "num_preemptions"); + read_counter(gpu.always_count, at.always_count, always_count, "always_count"); + + if (preemption_reaction_delay || num_preemptions) { + at_log_perfctr_h("preemption_reaction_delay: %" PRIu64 ", always_count: %" PRIu64 + ", num_preemptions: %" PRIu64, + rp_hash, preemption_reaction_delay, always_count, num_preemptions); + } + + return preemption_reaction_delay; + } + + void emit_debug_perfcntr_start(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + auto countable_begin = [&](const struct fd_perfcntr_countable *ctbl, uint32_t cntr_idx, uint32_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + assert(!ctr->enable); /* CP counters shouldn't use it. */ + + tu_cs_emit_pkt4(cs, ctr->select_reg, 1); + tu_cs_emit(cs, ctbl->selector); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_begin)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, begin)); + }; + + countable_begin(at.preemption_reaction_delay, 10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_begin(at.num_preemptions, 11, offsetof(rp_gpu_data, num_preemptions)); + countable_begin(at.always_count, 12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + } + + void emit_debug_perfcntr_end(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + tu_cs_emit_wfi(cs); + + auto countable_end = [&](uint32_t cntr_idx, uint64_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_end)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, end)); + }; + + countable_end(10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_end(11, offsetof(rp_gpu_data, num_preemptions)); + countable_end(12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, bo_iova + offsetof(rp_gpu_data, cntrs_ready)); + tu_cs_emit_qw(cs, rp_gpu_data::CNTRS_READY_MAGIC); + } +#endif + + /** CS Emission **/ + + void emit_rp_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_start(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_start)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_start(cs, 
*cmd->device->autotune, bo_iova); +#endif + } + + void emit_rp_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_end(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start), + bo_iova + offsetof(rp_gpu_data, samples_end)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_end)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_end(cs, *cmd->device->autotune, bo_iova); +#endif + } + + void emit_tile_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_start)); + } + + void emit_tile_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_end)); + } +}; + +tu_autotune::rp_entry_batch::rp_entry_batch(): active(false), fence(0), entries() +{ } -static struct tu_renderpass_result * -create_history_result(struct tu_autotune *at, uint64_t rp_key) +void +tu_autotune::rp_entry_batch::assign_fence(uint32_t new_fence) { - struct tu_renderpass_result *result = - (struct tu_renderpass_result *) calloc(1, sizeof(*result)); - result->rp_key = rp_key; - - return result; + assert(!active); /* Cannot assign a fence to an active entry batch. */ + fence = new_fence; + active = true; } -static void -history_add_result(struct tu_device *dev, struct tu_renderpass_history *history, - struct tu_renderpass_result *result) -{ - list_delinit(&result->node); - list_add(&result->node, &history->results); +/** Renderpass state tracking. **/ - if (history->num_results < MAX_HISTORY_RESULTS) { - history->num_results++; +tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd) +{ + /* It may be hard to match the same renderpass between frames, or rather it's hard to strike a + * balance between being too lax with identifying different renderpasses as the same one, and + * not recognizing the same renderpass between frames when only a small thing changed. + * + * This is mainly an issue with translation layers (particularly DXVK), because a layer may + * break a "renderpass" into smaller ones due to some heuristic that isn't consistent between + * frames. + * + * Note: Not using image IOVA leads to too many false matches. 
+ */ + + struct PACKED packed_att_properties { + uint64_t iova; + bool load; + bool store; + bool load_stencil; + bool store_stencil; + }; + + auto get_hash = [&](uint32_t *data, size_t size) { + uint32_t *ptr = data; + *ptr++ = framebuffer->width; + *ptr++ = framebuffer->height; + *ptr++ = framebuffer->layers; + + for (unsigned i = 0; i < pass->attachment_count; i++) { + packed_att_properties props = { + .iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset, + .load = pass->attachments[i].load, + .store = pass->attachments[i].store, + .load_stencil = pass->attachments[i].load_stencil, + .store_stencil = pass->attachments[i].store_stencil, + }; + + memcpy(ptr, &props, sizeof(packed_att_properties)); + ptr += sizeof(packed_att_properties) / sizeof(uint32_t); + } + assert(ptr == data + size); + + return XXH64(data, size * sizeof(uint32_t), 0); + }; + + /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of + * cases, while only extreme cases need to allocate on the heap. + */ + size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t)); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */ + + if (data_count <= STACK_MAX_DATA_COUNT) { + /* If the data is small enough, we can use the stack. */ + std::array arr; + hash = get_hash(arr.data(), data_count); } else { - /* Once above the limit, start popping old results off the - * tail of the list: + /* If the data is too large, we have to allocate it on the heap. */ + std::vector vec(data_count); + hash = get_hash(vec.data(), vec.size()); + } +} + +tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates) +{ + hash = XXH64(&key.hash, sizeof(key.hash), duplicates); +} + +/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing + * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). + */ +template class exponential_average { + private: + std::atomic average = std::numeric_limits::quiet_NaN(); + double alpha; + + public: + explicit exponential_average(double alpha = 0.1) noexcept: alpha(alpha) + { + } + + bool empty() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current); + } + + void add(T value) noexcept + { + double v = static_cast(value); + double current = average.load(std::memory_order_relaxed); + double new_avg; + do { + new_avg = std::isnan(current) ? v : (1.0 - alpha) * current + alpha * v; + } while (!average.compare_exchange_weak(current, new_avg, std::memory_order_relaxed, std::memory_order_relaxed)); + } + + void clear() noexcept + { + average.store(std::numeric_limits::quiet_NaN(), std::memory_order_relaxed); + } + + T get() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current) ? T {} : static_cast(current); + } +}; + +/* An improvement over pure EMA to filter out spikes by using two EMAs: + * - A "slow" EMA with a low alpha to track the long-term average. + * - A "fast" EMA with a high alpha to track short-term changes. + * When retrieving the average, if the fast EMA deviates significantly from the slow EMA, it indicates a spike, and we + * fall back to the slow EMA. 
+ */ +template class adaptive_average { + private: + static constexpr double DEFAULT_SLOW_ALPHA = 0.1, DEFAULT_FAST_ALPHA = 0.5, DEFAULT_DEVIATION_THRESHOLD = 0.3; + exponential_average slow; + exponential_average fast; + double deviationThreshold; + + public: + size_t count = 0; + + explicit adaptive_average(double slow_alpha = DEFAULT_SLOW_ALPHA, + double fast_alpha = DEFAULT_FAST_ALPHA, + double deviation_threshold = DEFAULT_DEVIATION_THRESHOLD) noexcept + : slow(slow_alpha), fast(fast_alpha), deviationThreshold(deviation_threshold) + { + } + + void add(T value) noexcept + { + slow.add(value); + fast.add(value); + count++; + } + + T get() const noexcept + { + double s = slow.get(); + double f = fast.get(); + /* Use fast if it's close to slow (normal variation). + * Use slow if fast deviates too much (likely a spike). */ - struct tu_renderpass_result *old_result = - list_last_entry(&history->results, struct tu_renderpass_result, node); - mtx_lock(&dev->autotune_mutex); - free_result(dev, old_result); - mtx_unlock(&dev->autotune_mutex); + double deviation = std::abs(f - s) / s; + return (deviation < deviationThreshold) ? f : s + (f - s) * deviationThreshold; } - /* Do calculations here to avoid locking history in tu_autotune_use_bypass */ - uint32_t total_samples = 0; - list_for_each_entry(struct tu_renderpass_result, result, - &history->results, node) { - total_samples += result->samples_passed; + void clear() noexcept + { + slow.clear(); + fast.clear(); + count = 0; + } +}; + +/* All historical state pertaining to a uniquely identified RP. This integrates data from RP entries, accumulating + * metrics over the long-term and providing autotune algorithms using the data. + */ +struct tu_autotune::rp_history { + private: + /* Amount of duration samples for profiling before we start averaging. */ + static constexpr uint32_t MIN_PROFILE_DURATION_COUNT = 5; + + adaptive_average sysmem_rp_average; + adaptive_average gmem_rp_average; + + public: + uint64_t hash; /* The hash of the renderpass, just for debug output. */ + + std::atomic refcount = 0; /* Reference count to prevent deletion when active. */ + std::atomic last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. 
*/ + + rp_history(uint64_t hash): hash(hash), last_use_ts(os_time_get_nano()), profiled(hash) + { } - float avg_samples = (float)total_samples / (float)history->num_results; - p_atomic_set(&history->avg_samples, (uint32_t)avg_samples); -} + /** Bandwidth Estimation Algorithm **/ + struct bandwidth_algo { + private: + exponential_average mean_samples_passed; -static void -process_results(struct tu_autotune *at, uint32_t current_fence) -{ - struct tu_device *dev = at->device; - - list_for_each_entry_safe(struct tu_renderpass_result, result, - &at->pending_results, node) { - if (fence_before(current_fence, result->fence)) - break; - - struct tu_renderpass_history *history = result->history; - result->samples_passed = - result->samples->samples_end - result->samples->samples_start; - - history_add_result(dev, history, result); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - if (fence_before(current_fence, submission_data->fence)) - break; - - finish_submission_data(at, submission_data); - } -} - -static void -queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf) -{ - bool one_time_submit = cmdbuf->usage_flags & - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - - if (one_time_submit) { - /* We can just steal the list since it won't be resubmitted again */ - list_splicetail(&cmdbuf->renderpass_autotune_results, - &at->pending_results); - list_inithead(&cmdbuf->renderpass_autotune_results); - } else { - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - /* TODO: copying each result isn't nice */ - struct tu_renderpass_result *copy = - (struct tu_renderpass_result *) malloc(sizeof(*result)); - *copy = *result; - tu_bo_get_ref(copy->bo.bo); - list_addtail(©->node, &at->pending_results); + public: + void update(uint32_t samples) + { + mean_samples_passed.add(samples); } + + render_mode get_optimal_mode(rp_history &history, + const struct tu_cmd_state *cmd_state, + const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_render_pass_state *rp_state) + { + const VkExtent2D &extent = cmd_state->render_area.extent; + const uint32_t pass_pixel_count = extent.width * extent.height; + uint64_t sysmem_bandwidth = (uint64_t) pass->sysmem_bandwidth_per_pixel * pass_pixel_count; + uint64_t gmem_bandwidth = (uint64_t) pass->gmem_bandwidth_per_pixel * pass_pixel_count; + + uint64_t total_draw_call_bandwidth = 0; + uint64_t mean_samples = mean_samples_passed.get(); + if (rp_state->drawcall_count && mean_samples > 0.0) { + /* The total draw call bandwidth is estimated as the average samples (collected via tracking samples passed + * within the CS) multiplied by the drawcall bandwidth per sample, divided by the amount of draw calls. + * + * This is a rough estimate of the bandwidth used by the draw calls in the renderpass for FB writes which + * is used to determine whether to use SYSMEM or GMEM. + */ + total_draw_call_bandwidth = + (mean_samples * rp_state->drawcall_bandwidth_per_sample_sum) / rp_state->drawcall_count; + } + + /* Drawcalls access the memory in SYSMEM rendering (ignoring CCU). */ + sysmem_bandwidth += total_draw_call_bandwidth; + + /* Drawcalls access GMEM in GMEM rendering, but we do not want to ignore them completely. The state changes + * between tiles also have an overhead. The magic numbers of 11 and 10 are randomly chosen. 
+ */ + gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; + + bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; + render_mode mode = select_sysmem ? render_mode::SYSMEM : render_mode::GMEM; + + at_log_bandwidth_h( + "%" PRIu32 " selecting %s\n" + " mean_samples=%" PRIu64 ", draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64 + ", render_area=%" PRIu32 "x%" PRIu32 ", sysmem_bandwidth_per_pixel=%" PRIu32 + ", gmem_bandwidth_per_pixel=%" PRIu32 ", sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, + history.hash, rp_state->drawcall_count, render_mode_str(mode), mean_samples, + (float) rp_state->drawcall_bandwidth_per_sample_sum / rp_state->drawcall_count, total_draw_call_bandwidth, + extent.width, extent.height, pass->sysmem_bandwidth_per_pixel, pass->gmem_bandwidth_per_pixel, + sysmem_bandwidth, gmem_bandwidth); + + return mode; + } + } bandwidth; + + /** Profiled Algorithms **/ + struct profiled_algo { + private: + /* Range [0 (GMEM), 100 (SYSMEM)], where 50 means no preference. */ + constexpr static uint32_t PROBABILITY_MAX = 100, PROBABILITY_MID = 50; + constexpr static uint32_t PROBABILITY_PREFER_SYSMEM = 80, PROBABILITY_PREFER_GMEM = 20; + + std::atomic sysmem_probability = PROBABILITY_MID; + bool should_reset = false; /* If true, will reset sysmem_probability before next update. */ + bool locked = false; /* If true, the probability will no longer be updated. */ + uint64_t seed[2] { 0x3bffb83978e24f88, 0x9238d5d56c71cd35 }; + + bool is_sysmem_winning = false; + uint64_t winning_since_ts = 0; + + public: + profiled_algo(uint64_t hash) + { + seed[1] = hash; + } + + void update(rp_history &history, bool immediate) + { + if (locked) + return; + + auto &sysmem_ema = history.sysmem_rp_average; + auto &gmem_ema = history.gmem_rp_average; + uint32_t sysmem_prob = sysmem_probability.load(std::memory_order_relaxed); + if (immediate) { + /* Try to immediately resolve the probability, this is useful for CI running a single trace of frames where + * the probabilites aren't expected to change from run to run. This environment also gives us a best case + * scenario for autotune performance, since we know the optimal decisions. + */ + + if (sysmem_ema.count < 1) { + sysmem_prob = PROBABILITY_MAX; + } else if (gmem_ema.count < 1) { + sysmem_prob = 0; + } else { + sysmem_prob = gmem_ema.get() < sysmem_ema.get() ? 0 : PROBABILITY_MAX; + locked = true; + } + } else { + if (sysmem_ema.count < MIN_PROFILE_DURATION_COUNT || gmem_ema.count < MIN_PROFILE_DURATION_COUNT) { + /* Not enough data to make a decision, bias towards least used. */ + sysmem_prob = sysmem_ema.count < gmem_ema.count ? PROBABILITY_PREFER_SYSMEM : PROBABILITY_PREFER_GMEM; + should_reset = true; + } else { + if (should_reset) { + sysmem_prob = PROBABILITY_MID; + should_reset = false; + } + + /* Adjust probability based on timing results. 
*/ + constexpr uint32_t FAST_STEP_DELTA = 5, FAST_MIN_PROBABILITY = 5, FAST_MAX_PROBABILITY = 95; + constexpr uint32_t SLOW_STEP_DELTA = 1, SLOW_MIN_PROBABILITY = 1, SLOW_MAX_PROBABILITY = 99; + + uint64_t avg_sysmem = sysmem_ema.get(); + uint64_t avg_gmem = gmem_ema.get(); + + if (avg_gmem < avg_sysmem) { + if (sysmem_prob > FAST_MIN_PROBABILITY && sysmem_prob <= FAST_MAX_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - FAST_STEP_DELTA, FAST_MIN_PROBABILITY); + else if (sysmem_prob > SLOW_MIN_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - SLOW_STEP_DELTA, SLOW_MIN_PROBABILITY); + } else if (avg_sysmem < avg_gmem) { + if (sysmem_prob >= FAST_MIN_PROBABILITY && sysmem_prob < FAST_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + FAST_STEP_DELTA, FAST_MAX_PROBABILITY); + else if (sysmem_prob < SLOW_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + SLOW_STEP_DELTA, SLOW_MAX_PROBABILITY); + } + + /* If the RP duration exceeds a certain minimum duration threshold (i.e. has a large impact on frametime) + * and the percentage difference between the modes is large enough, we lock into the optimal mode. This + * avoids performance hazards from switching to an extremely suboptimal mode even if done very rarely. + * Note: Due to the potentially huge negative impact of a bad lock, this is a very conservative check. + */ + constexpr uint32_t MIN_LOCK_DURATION_COUNT = 15; + constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */ + constexpr uint32_t LOCK_PERCENT_DIFF = 30; + constexpr uint64_t LOCK_TIME_WINDOW_NS = 30'000'000'000; /* 30s */ + + uint64_t now = os_time_get_nano(); + bool current_sysmem_winning = avg_sysmem < avg_gmem; + + if (winning_since_ts == 0 || current_sysmem_winning != is_sysmem_winning) { + winning_since_ts = now; + is_sysmem_winning = current_sysmem_winning; + } + + bool has_resolved = sysmem_prob == SLOW_MAX_PROBABILITY || sysmem_prob == SLOW_MIN_PROBABILITY; + bool enough_samples = + sysmem_ema.count >= MIN_LOCK_DURATION_COUNT && gmem_ema.count >= MIN_LOCK_DURATION_COUNT; + uint64_t min_avg = MIN2(avg_sysmem, avg_gmem); + uint64_t max_avg = MAX2(avg_sysmem, avg_gmem); + uint64_t percent_diff = (100 * (max_avg - min_avg)) / min_avg; + + if (has_resolved && enough_samples && max_avg >= MIN_LOCK_THRESHOLD && + percent_diff >= LOCK_PERCENT_DIFF && (now - winning_since_ts) >= LOCK_TIME_WINDOW_NS) { + if (avg_gmem < avg_sysmem) + sysmem_prob = 0; + else + sysmem_prob = 100; + locked = true; + } + } + } + + sysmem_probability.store(sysmem_prob, std::memory_order_relaxed); + + at_log_profiled_h("update%s avg_gmem: %" PRIu64 " us (%" PRIu64 " samples) avg_sysmem: %" PRIu64 + " us (%" PRIu64 " samples) = sysmem_probability: %" PRIu32 " locked: %u", + history.hash, immediate ? "-imm" : "", ticks_to_us(gmem_ema.get()), gmem_ema.count, + ticks_to_us(sysmem_ema.get()), sysmem_ema.count, sysmem_prob, locked); + } + + public: + render_mode get_optimal_mode(rp_history &history) + { + uint32_t l_sysmem_probability = sysmem_probability.load(std::memory_order_relaxed); + bool select_sysmem = (rand_xorshift128plus(seed) % PROBABILITY_MAX) < l_sysmem_probability; + render_mode mode = select_sysmem ? 
render_mode::SYSMEM : render_mode::GMEM; + + at_log_profiled_h("%" PRIu32 "%% sysmem chance, using %s", history.hash, l_sysmem_probability, + render_mode_str(mode)); + + return mode; + } + } profiled; + + /** Preemption Latency Optimization Mode **/ + struct preempt_optimize_mode { + private: + adaptive_average sysmem_draw_average; + adaptive_average gmem_tile_average; + + /* If the renderpass has long draws which are at risk of causing high preemptible latency. */ + std::atomic latency_risk = false; + /* The factor by which the tile size should be divided to reduce preemption latency. */ + std::atomic tile_size_divisor = 1; + + /* The next timestamp to update the latency sensitivity parameters at. */ + uint64_t latency_update_ts = 0; + /* The next timestamp where it's allowed to decrement the divisor. */ + uint64_t divisor_decrement_ts = 0; + /* The next timestamp where it's allowed to mark the RP as no longer latency sensitive. */ + uint64_t latency_switch_ts = 0; + + /* Threshold of longest non-preemptible duration before activating latency optimization: 1.5ms */ + static constexpr uint64_t TARGET_THRESHOLD = GPU_TICKS_PER_US * 1500; + +#if TU_AUTOTUNE_DEBUG_PERFCTR + /* The highest preemption reaction delay recorded for the RP since the last update. */ + uint64_t max_preemption_latency = 0; + + public: + void update_preemption_latency(uint64_t preemption_latency) + { + max_preemption_latency = MAX2(max_preemption_latency, preemption_latency); + } +#endif + + public: + void update_sysmem(rp_history &history, uint64_t draw_duration) + { + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + + if (!l_latency_risk) { + /* Try to estimate the minimum non-preemptible duration for draw-level preemption, by dividing the total + * time by the RP by the amount of draws. This isn't very accurate as it's skewed by the time taken by + * commands other than draws (e.g. clears or blits), but it's a good enough estimate to catch the worst + * offenders. + * + * If the average draw duration is above a certain threshold, we mark the RP as latency sensitive which + * should bias the decision towards GMEM. + */ + + sysmem_draw_average.add(draw_duration); + uint64_t avg_sysmem_draw = sysmem_draw_average.get(); + uint64_t sysmem_draw_count = sysmem_draw_average.count; + + at_log_preempt_h("avg_sysmem_draw: %" PRIu64 " us (%u), latency_risk: %u" +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , + history.hash, ticks_to_us(avg_sysmem_draw), avg_sysmem_draw > TARGET_THRESHOLD, + l_latency_risk +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif + ); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif + + if (sysmem_draw_count >= MIN_PROFILE_DURATION_COUNT && avg_sysmem_draw > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + at_log_preempt_h("high sysmem draw duration %" PRIu64 " us, marking as latency sensitive", history.hash, + ticks_to_us(avg_sysmem_draw)); + } + } + } + + void update_gmem(rp_history &history, uint64_t tile_duration) + { + constexpr uint64_t default_update_duration_ns = 100'000'000; /* 100ms */ + constexpr uint64_t change_update_duration_ns = 500'000'000; /* 500ms */ + constexpr uint64_t downward_update_duration_ns = 10'000'000'000; /* 10s */ + constexpr uint64_t latency_insensitive_duration_ns = 30'000'000'000; /* 30s */ + + gmem_tile_average.add(tile_duration); + + uint64_t now = os_time_get_nano(); + if (latency_update_ts > now) + return; /* No need to update yet. 
*/ + + /* If the RP is latency sensitive and we're using GMEM, we should check if it's worth reducing the tile size to + * reduce the latency risk further or if it's already low enough that it's not worth the performance hit. + */ + + uint64_t update_duration_ns = default_update_duration_ns; + if (gmem_tile_average.count > MIN_PROFILE_DURATION_COUNT) { + uint64_t avg_gmem_tile = gmem_tile_average.get(); + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + if (!l_latency_risk) { + if (avg_gmem_tile > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + latency_switch_ts = now + latency_insensitive_duration_ns; + + at_log_preempt_h("high gmem tile duration %" PRIu64 ", marking as latency sensitive", history.hash, + avg_gmem_tile); + } + } else { + uint32_t l_tile_size_divisor = tile_size_divisor.load(std::memory_order_relaxed); + at_log_preempt_h("avg_gmem_tile: %" PRIu64 " us (%u), latency_risk: %u, tile_size_divisor: %" PRIu32 +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , + history.hash, ticks_to_us(avg_gmem_tile), avg_gmem_tile > TARGET_THRESHOLD, + l_latency_risk, l_tile_size_divisor +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif + ); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif + + int delta = 0; + if (avg_gmem_tile > TARGET_THRESHOLD && l_tile_size_divisor < TU_GMEM_LAYOUT_DIVISOR_MAX) { + /* If the average tile duration is high, we should reduce the tile size to reduce the latency risk. */ + delta = 1; + + divisor_decrement_ts = now + downward_update_duration_ns; + } else if (avg_gmem_tile * 4 < TARGET_THRESHOLD && l_tile_size_divisor > 1 && + divisor_decrement_ts <= now) { + /* If the average tile duration is low enough that we can get away with a larger tile size, we should + * increase the tile size to reduce the performance hit of the smaller tiles. + * + * Note: The 4x factor is to account for the tile duration being halved when we increase the tile size + * divisor by 1, with an additional 2x factor to generally be conservative about reducing the divisor + * since it can lead to oscillation between tile sizes. + * + * Similarly, divisor_decrement_ts is used to limit how often we can reduce the divisor to avoid + * oscillation. + */ + delta = -1; + latency_switch_ts = now + latency_insensitive_duration_ns; + } else if (avg_gmem_tile * 10 < TARGET_THRESHOLD && l_tile_size_divisor == 1 && + latency_switch_ts <= now) { + /* If the average tile duration is low enough that we no longer consider the RP latency sensitive, we + * can switch it back to non-latency sensitive. + */ + latency_risk.store(false, std::memory_order_relaxed); + } + + if (delta != 0) { + /* Clear all the results to avoid biasing the decision based on the old tile size. */ + gmem_tile_average.clear(); + + uint32_t new_tile_size_divisor = l_tile_size_divisor + delta; + at_log_preempt_h("updating tile size divisor: %" PRIu32 " -> %" PRIu32, history.hash, + l_tile_size_divisor, new_tile_size_divisor); + + tile_size_divisor.store(new_tile_size_divisor, std::memory_order_relaxed); + + update_duration_ns = change_update_duration_ns; + } + } + + latency_update_ts = now + update_duration_ns; + } + } + + /* If this RP has a risk of causing high preemption latency. 
*/ + bool is_latency_sensitive() const + { + return latency_risk.load(std::memory_order_relaxed); + } + + uint32_t get_tile_size_divisor() const + { + return tile_size_divisor.load(std::memory_order_relaxed); + } + } preempt_optimize; + + void process(rp_entry &entry, tu_autotune &at) + { + /* We use entry config to know what metrics it has, autotune config to know what algorithms are enabled. */ + config_t entry_config = entry.config; + config_t at_config = at.active_config.load(); + + if (entry_config.test(metric_flag::SAMPLES) && at_config.is_enabled(algorithm::BANDWIDTH)) + bandwidth.update(entry.get_samples_passed()); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + preempt_optimize.update_preemption_latency(entry.get_preemption_reaction_delay(at, hash)); +#endif + + if (entry_config.test(metric_flag::TS)) { + if (entry.sysmem) { + uint64_t rp_duration = entry.get_rp_duration(); + + sysmem_rp_average.add(rp_duration); + + if (at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_sysmem(*this, rp_duration / entry.draw_count); + } else { + gmem_rp_average.add(entry.get_rp_duration()); + + if (entry_config.test(metric_flag::TS_TILE) && at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_gmem(*this, entry.get_max_tile_duration()); + } + + if (at_config.is_enabled(algorithm::PROFILED) || at_config.is_enabled(algorithm::PROFILED_IMM)) { + profiled.update(*this, at_config.is_enabled(algorithm::PROFILED_IMM)); + } + } + } +}; + +tu_autotune::rp_history_handle::~rp_history_handle() +{ + if (!history) + return; + + history->last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed); + ASSERTED uint32_t old_refcount = history->refcount.fetch_sub(1, std::memory_order_relaxed); + assert(old_refcount != 0); /* Underflow check. */ +} + +tu_autotune::rp_history_handle::rp_history_handle(rp_history &history): history(&history) +{ + history.refcount.fetch_add(1, std::memory_order_relaxed); + history.last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed); +} + +tu_autotune::rp_history_handle +tu_autotune::find_rp_history(const rp_key &key) +{ + std::shared_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return rp_history_handle(it->second); + + return rp_history_handle(nullptr); +} + +tu_autotune::rp_history_handle +tu_autotune::find_or_create_rp_history(const rp_key &key) +{ + rp_history_handle existing = find_rp_history(key); + if (existing) + return existing; + + /* If we reach here, we have to create a new history. */ + std::unique_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return it->second; /* Another thread created the history while we were waiting for the lock. */ + auto history = rp_histories.emplace(std::make_pair(key, key.hash)); + return rp_history_handle(history.first->second); +} + +void +tu_autotune::reap_old_rp_histories() +{ + constexpr uint64_t REAP_INTERVAL_NS = 10'000'000'000; /* 10s */ + uint64_t now = os_time_get_nano(); + if (last_reap_ts + REAP_INTERVAL_NS > now) + return; + last_reap_ts = now; + + constexpr size_t MAX_RP_HISTORIES = 1024; /* Not a hard limit, we might exceed this if there's many active RPs. */ + { + /* Quicker non-unique lock, should hit this path mostly. 
*/ + std::shared_lock lock(rp_mutex); + if (rp_histories.size() <= MAX_RP_HISTORIES) + return; + } + + std::unique_lock lock(rp_mutex); + size_t og_size = rp_histories.size(); + if (og_size <= MAX_RP_HISTORIES) + return; + + std::vector candidates; + candidates.reserve(og_size); + for (auto it = rp_histories.begin(); it != rp_histories.end(); ++it) { + if (it->second.refcount.load(std::memory_order_relaxed) == 0) + candidates.push_back(it); + } + + size_t to_purge = std::min(candidates.size(), og_size - MAX_RP_HISTORIES); + if (to_purge == 0) { + at_log_base("no RP histories to reap at size %zu, all are active", og_size); + return; + } + + /* Partition candidates by last use timestamp, oldest first. */ + auto partition_end = candidates.begin() + to_purge; + if (to_purge < candidates.size()) { + std::nth_element(candidates.begin(), partition_end, candidates.end(), + [](rp_histories_t::iterator a, rp_histories_t::iterator b) { + return a->second.last_use_ts.load(std::memory_order_relaxed) < + b->second.last_use_ts.load(std::memory_order_relaxed); + }); + } + + for (auto it = candidates.begin(); it != partition_end; ++it) { + rp_history &history = (*it)->second; + if (history.refcount.load(std::memory_order_relaxed) == 0) { + at_log_base("reaping RP history %016" PRIx64, history.hash); + rp_histories.erase(*it); + } + } + + at_log_base("reaped old RP histories %zu -> %zu", og_size, rp_histories.size()); +} + +void +tu_autotune::process_entries() +{ + uint32_t current_fence = device->global_bo_map->autotune_fence; + + while (!active_batches.empty()) { + auto &batch = active_batches.front(); + assert(batch->active); + + if (fence_before(current_fence, batch->fence)) + break; /* Entries are allocated in sequence, next will be newer and + also fail so we can just directly break out of the loop. */ + + for (auto &entry : batch->entries) + entry->history->process(*entry, *this); + + active_batches.pop_front(); + } + + if (active_batches.size() > 10) { + at_log_base("high amount of active batches: %zu, fence: %" PRIu32 " < %" PRIu32, active_batches.size(), + current_fence, active_batches.front()->fence); } } struct tu_cs * -tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count) +tu_autotune::on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count) { - /* We are single-threaded here */ - const uint32_t gpu_fence = get_autotune_fence(at); - const uint32_t new_fence = at->fence_counter++; - - process_results(at, gpu_fence); - - /* Create history entries here to minimize work and locking being - * done on renderpass end. + /* This call occurs regularly and we are single-threaded here, so we use this opportunity to process any available + * entries. It's also important that any entries are processed here because we always want to ensure that we've + * processed all entries from prior CBs before we submit any new CBs with the same RP to the GPU. 
*/ + process_entries(); + reap_old_rp_histories(); + + bool has_results = false; for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - struct tu_renderpass_history *history; - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &result->rp_key); - if (!entry) { - history = - (struct tu_renderpass_history *) calloc(1, sizeof(*history)); - history->key = result->rp_key; - list_inithead(&history->results); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_insert(at->ht, &history->key, history); - u_rwlock_wrunlock(&at->ht_lock); - } else { - history = (struct tu_renderpass_history *) entry->data; - } - - history->last_fence = new_fence; - - result->fence = new_fence; - result->history = history; + auto &batch = cmd_buffers[i]->autotune_ctx.batch; + if (!batch->entries.empty()) { + has_results = true; + break; } } + if (!has_results) + return nullptr; /* No results to process, return early. */ - struct tu_submission_data *submission_data = - create_submission_data(dev, at, new_fence); - + /* Generate a new fence and the CS for it. */ + const uint32_t new_fence = next_fence++; + auto fence_cs = get_cs_for_fence(new_fence); for (uint32_t i = 0; i < cmd_buffer_count; i++) { + /* Transfer the entries from the command buffers to the active queue. */ struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (list_is_empty(&cmdbuf->renderpass_autotune_results)) + auto &batch = cmdbuf->autotune_ctx.batch; + if (batch->entries.empty()) continue; - queue_pending_results(at, cmdbuf); + batch->assign_fence(new_fence); + if (cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { + /* If the command buffer is one-time submit, we can move the batch directly into the active batches, as it + * won't be used again. This would lead to it being deallocated as early as possible. + */ + active_batches.push_back(std::move(batch)); + } else { + active_batches.push_back(batch); + } } - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Total history entries: %u", at->ht->entries); + return fence_cs; +} - /* Cleanup old entries from history table. The assumption - * here is that application doesn't hold many old unsubmitted - * command buffers, otherwise this table may grow big. 
+tu_autotune::tu_autotune(struct tu_device *device, VkResult &result): device(device), active_config(get_env_config()) +{ + tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint32_t group_count; + const struct fd_perfcntr_group *groups = fd_perfcntrs(&device->physical_device->dev_id, &group_count); + + for (uint32_t i = 0; i < group_count; i++) { + if (strcmp(groups[i].name, "CP") == 0) { + cp_group = &groups[i]; + break; + } + } + + if (!cp_group) { + mesa_loge("autotune: CP group not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } else if (cp_group->num_countables < 5) { + mesa_loge("autotune: CP group has too few countables"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } + + auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group, + const char *name) -> const struct fd_perfcntr_countable * { + for (uint32_t i = 0; i < group->num_countables; i++) { + if (strcmp(group->countables[i].name, name) == 0) + return &group->countables[i]; + } + + mesa_loge("autotune: %s not found in group %s", name, group->name); + return nullptr; + }; + + preemption_reaction_delay = get_perfcntr_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); + num_preemptions = get_perfcntr_countable(cp_group, "PERF_CP_NUM_PREEMPTIONS"); + always_count = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); + + if (!preemption_reaction_delay || !num_preemptions || !always_count) { + mesa_loge("autotune: preemption countables not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } +#endif + + result = VK_SUCCESS; + return; +} + +tu_autotune::~tu_autotune() +{ + if (TU_AUTOTUNE_FLUSH_AT_FINISH) { + while (!active_batches.empty()) + process_entries(); + at_log_base("finished processing all entries"); + } + + tu_bo_suballocator_finish(&suballoc); +} + +tu_autotune::cmd_buf_ctx::cmd_buf_ctx(): batch(std::make_shared()) +{ +} + +tu_autotune::cmd_buf_ctx::~cmd_buf_ctx() +{ + /* This is empty but it causes the implicit destructor to be compiled within this compilation unit with access to + * internal structures. Otherwise, we would need to expose the full definition of autotuner internals in the header + * file, which is not desirable. 
*/ - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME)) - continue; - - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Removed old history entry %016" PRIx64 "", history->key); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_remove_key(at->ht, &history->key); - u_rwlock_wrunlock(&at->ht_lock); - - mtx_lock(&dev->autotune_mutex); - free_history(dev, history); - mtx_unlock(&dev->autotune_mutex); - } - - return &submission_data->fence_cs; -} - -static bool -renderpass_key_equals(const void *_a, const void *_b) -{ - return *(uint64_t *)_a == *(uint64_t *)_b; -} - -static uint32_t -renderpass_key_hash(const void *_a) -{ - return *((uint64_t *) _a) & 0xffffffff; -} - -VkResult -tu_autotune_init(struct tu_autotune *at, struct tu_device *dev) -{ - at->enabled = true; - at->device = dev; - at->ht = _mesa_hash_table_create(NULL, - renderpass_key_hash, - renderpass_key_equals); - u_rwlock_init(&at->ht_lock); - - list_inithead(&at->pending_results); - list_inithead(&at->pending_submission_data); - list_inithead(&at->submission_data_pool); - - /* start from 1 because tu6_global::autotune_fence is initialized to 0 */ - at->fence_counter = 1; - - return VK_SUCCESS; } void -tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) +tu_autotune::cmd_buf_ctx::reset() { - if (TU_AUTOTUNE_LOG_AT_FINISH) { - while (!list_is_empty(&at->pending_results)) { - const uint32_t gpu_fence = get_autotune_fence(at); - process_results(at, gpu_fence); - } - - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - - mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u", - history->key, history->avg_samples, history->num_results); - } - } - - tu_autotune_free_results(dev, &at->pending_results); - - mtx_lock(&dev->autotune_mutex); - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - free_history(dev, history); - } - mtx_unlock(&dev->autotune_mutex); - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - free_submission_data(submission_data); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->submission_data_pool, node) { - free_submission_data(submission_data); - } - - _mesa_hash_table_destroy(at->ht, NULL); - u_rwlock_destroy(&at->ht_lock); + batch = std::make_shared(); } -bool -tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count) +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, + rp_history_handle &&history, + config_t config, + uint32_t drawcall_count) { - for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) - return true; + std::unique_ptr &new_entry = + batch->entries.emplace_back(std::make_unique(device, std::move(history), config, drawcall_count)); + return new_entry.get(); +} + +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key) +{ + for (auto &entry : batch->entries) { + if (entry->history->hash == key.hash) + return entry.get(); } - - return false; + return nullptr; } -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results) +tu_autotune::render_mode 
+tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { - list_for_each_entry_safe(struct tu_renderpass_result, result, - results, node) { - free_result(dev, result); - } -} + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + cmd_buf_ctx &cb_ctx = cmd_buffer->autotune_ctx; + config_t config = active_config.load(); -void -tu_autotune_free_results(struct tu_device *dev, struct list_head *results) -{ - mtx_lock(&dev->autotune_mutex); - tu_autotune_free_results_locked(dev, results); - mtx_unlock(&dev->autotune_mutex); -} - -static bool -fallback_use_bypass(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd_buffer) -{ - if (cmd_buffer->state.rp.drawcall_count > 5) - return false; - - for (unsigned i = 0; i < pass->subpass_count; i++) { - if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT) - return false; - } - - return true; -} - -static uint32_t -get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd) -{ - const VkExtent2D *extent = &cmd->state.render_area.extent; - return extent->width * extent->height; -} - -static uint64_t -estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd, - uint32_t avg_renderpass_sample_count) -{ - const struct tu_cmd_state *state = &cmd->state; - - if (!state->rp.drawcall_count) - return 0; - - /* sample count times drawcall_bandwidth_per_sample */ - return (uint64_t)avg_renderpass_sample_count * - state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count; -} - -bool -tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result) -{ - const struct tu_render_pass *pass = cmd_buffer->state.pass; - const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; + /* Just to ensure a segfault for accesses, in case we don't set it. */ + *rp_ctx = nullptr; /* If a feedback loop in the subpass caused one of the pipelines used to set - * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even - * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased - * sysmem bandwidth (though we haven't quantified it). + * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even SINGLE_PRIM_MODE(FLUSH), then that should cause + * significantly increased SYSMEM bandwidth (though we haven't quantified it). */ - if (cmd_buffer->state.rp.sysmem_single_prim_mode) - return false; + if (rp_state->sysmem_single_prim_mode) + return render_mode::GMEM; - /* If the user is using a fragment density map, then this will cause less - * FS invocations with GMEM, which has a hard-to-measure impact on - * performance because it depends on how heavy the FS is in addition to how - * many invocations there were and the density. Let's assume the user knows - * what they're doing when they added the map, because if sysmem is - * actually faster then they could've just not used the fragment density - * map. + /* If the user is using a fragment density map, then this will cause less FS invocations with GMEM, which has a + * hard-to-measure impact on performance because it depends on how heavy the FS is in addition to how many + * invocations there were and the density. 
Let's assume the user knows what they're doing when they added the map,
+    * because if SYSMEM is actually faster then they could've just not used the fragment density map.
     */
    if (pass->has_fdm)
-      return false;
+      return render_mode::GMEM;
 
-   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
-    * we would have to allocate GPU memory at the submit time and copy
-    * results into it.
-    * Native games ususally don't use it, Zink and DXVK don't use it,
-    * D3D12 doesn't have such concept.
+   /* SYSMEM is always a safe default mode when we can't fully engage the autotuner. From testing, we know that an
+    * incorrect decision towards SYSMEM tends to be far less impactful than an incorrect decision towards GMEM, which
+    * can cause significant performance issues.
     */
-   bool simultaneous_use =
-      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+   constexpr render_mode default_mode = render_mode::SYSMEM;
 
-   if (!at->enabled || simultaneous_use)
-      return fallback_use_bypass(pass, framebuffer, cmd_buffer);
-
-   /* We use 64bit hash as a key since we don't fear rare hash collision,
-    * the worst that would happen is sysmem being selected when it should
-    * have not, and with 64bit it would be extremely rare.
+   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers, we would have to allocate GPU memory at the submit time
+    * and copy results into it. We just disable the complex autotuner in this case, which isn't a big issue since native
+    * games usually don't use it, Zink and DXVK don't use it, and D3D12 doesn't even have such a concept.
     *
-    * Q: Why not make the key from framebuffer + renderpass pointers?
-    * A: At least DXVK creates new framebuffers each frame while keeping
-    *    renderpasses the same. Also we want to support replaying a single
-    *    frame in a loop for testing.
+    * We combine this with processing entries at submit time, to avoid a race where the CPU hasn't processed the results
+    * from an earlier submission of the CB while a second submission of the CB is on the GPU queue.
     */
-   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
+   bool simultaneous_use = cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
 
-   *autotune_result = create_history_result(at, renderpass_key);
+   /* For these smaller RPs with few draws it is difficult to create a balanced hash: one that identifies each of them
+    * independently without being so unique that the same RP is no longer recognized across CBs. They're generally
+    * insignificant outside of a few edge cases, such as deferred rendering G-buffer passes; since we don't have a good
+    * way to deal with those edge cases yet, we just disable the autotuner for small RPs entirely for now unless
+    * TUNE_SMALL is specified.
+ */ + bool ignore_small_rp = !config.test(mod_flag::TUNE_SMALL) && rp_state->drawcall_count < 5; - uint32_t avg_samples = 0; - if (get_history(at, renderpass_key, &avg_samples)) { - const uint32_t pass_pixel_count = - get_render_pass_pixel_count(cmd_buffer); - uint64_t sysmem_bandwidth = - (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count; - uint64_t gmem_bandwidth = - (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count; + if (!enabled || simultaneous_use || ignore_small_rp) + return default_mode; - const uint64_t total_draw_call_bandwidth = - estimate_drawcall_bandwidth(cmd_buffer, avg_samples); + + /* We can return early with the decision based on the draw call count, instead of needing to hash the renderpass + * instance and look up the history, which is far more expensive. + * + * However, certain options such as latency sensitive mode take precedence over any of the other autotuner options + * and we cannot do so in those cases. + */ + bool can_early_return = !config.test(mod_flag::PREEMPT_OPTIMIZE); + auto early_return_mode = [&]() -> std::optional { + if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) + return render_mode::GMEM; + if (config.is_enabled(algorithm::PREFER_SYSMEM)) + return render_mode::SYSMEM; + return std::nullopt; + }(); - /* drawcalls access the memory in sysmem rendering (ignoring CCU) */ - sysmem_bandwidth += total_draw_call_bandwidth; - - /* drawcalls access gmem in gmem rendering, but we do not want to ignore - * them completely. The state changes between tiles also have an - * overhead. The magic numbers of 11 and 10 are randomly chosen. - */ - gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; - - const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; - if (TU_AUTOTUNE_DEBUG_LOG) { - const VkExtent2D *extent = &cmd_buffer->state.render_area.extent; - const float drawcall_bandwidth_per_sample = - (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum / - cmd_buffer->state.rp.drawcall_count; - - mesa_logi("autotune %016" PRIx64 ":%u selecting %s", - renderpass_key, - cmd_buffer->state.rp.drawcall_count, - select_sysmem ? "sysmem" : "gmem"); - mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64, - avg_samples, - drawcall_bandwidth_per_sample, - total_draw_call_bandwidth); - mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u", - extent->width, extent->height, - pass->sysmem_bandwidth_per_pixel, - pass->gmem_bandwidth_per_pixel); - mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, - sysmem_bandwidth, gmem_bandwidth); - } - - return select_sysmem; + if (can_early_return && early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (early)", rp_key(pass, framebuffer, cmd_buffer).hash, + rp_state->drawcall_count, render_mode_str(*early_return_mode)); + return *early_return_mode; } - return fallback_use_bypass(pass, framebuffer, cmd_buffer); + rp_key key(pass, framebuffer, cmd_buffer); + + /* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a + * unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses + * across different command buffers, it is good enough in most cases. 
+ */ + rp_entry *entry = cb_ctx.find_rp_entry(key); + if (entry) { + entry->duplicates++; + key = rp_key(key, entry->duplicates); + } + + *rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count); + rp_history &history = *((*rp_ctx)->history); + + if (config.test(mod_flag::PREEMPT_OPTIMIZE) && history.preempt_optimize.is_latency_sensitive()) { + /* Try to mitigate the risk of high preemption latency by always using GMEM, which should break up any larger + * draws into smaller ones with tiling. + */ + at_log_base_h("high preemption latency risk, using GMEM", key.hash); + return render_mode::GMEM; + } + + if (early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (late)", key.hash, rp_state->drawcall_count, + render_mode_str(*early_return_mode)); + return *early_return_mode; + } + + if (config.is_enabled(algorithm::PROFILED) || config.is_enabled(algorithm::PROFILED_IMM)) + return history.profiled.get_optimal_mode(history); + + if (config.is_enabled(algorithm::BANDWIDTH)) + return history.bandwidth.get_optimal_mode(history, cmd_state, pass, framebuffer, rp_state); + + return default_mode; +} + +uint32_t +tu_autotune::get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer) +{ + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + + if (!enabled || !active_config.load().test(mod_flag::PREEMPT_OPTIMIZE) || rp_state->sysmem_single_prim_mode || + pass->has_fdm || cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + return 1; + + rp_key key(pass, framebuffer, cmd_buffer); + rp_history *history = find_rp_history(key); + if (!history) { + at_log_base_h("no RP history found, using tile_size_divisor=1", key.hash); + return 1; + } + + uint32_t tile_size_divisor = history->preempt_optimize.get_tile_size_divisor(); + + return tile_size_divisor; } -template void -tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_autotune::disable_preempt_optimize() { - if (!autotune_result) - return; - - struct tu_device *dev = cmd->device; - - static const uint32_t size = sizeof(struct tu_renderpass_samples); - - mtx_lock(&dev->autotune_mutex); - VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size); - mtx_unlock(&dev->autotune_mutex); - if (ret != VK_SUCCESS) { - autotune_result->bo.iova = 0; - return; - } - - uint64_t result_iova = autotune_result->bo.iova; - - autotune_result->samples = - (struct tu_renderpass_samples *) tu_suballoc_bo_map( - &autotune_result->bo); - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova); - - /* If the renderpass contains an occlusion query with its own ZPASS_DONE, - * we have to provide a fake ZPASS_DONE event here to logically close the - * previous one, preventing firmware from misbehaving due to nested events. - * This writes into the samples_end field, which will be overwritten in - * tu_autotune_end_renderpass. 
- */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } - } else { - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + config_t original, updated; + do { + original = updated = active_config.load(); + if (!original.test(mod_flag::PREEMPT_OPTIMIZE)) + return; /* Already disabled, nothing to do. */ + updated.disable(mod_flag::PREEMPT_OPTIMIZE); + } while (!active_config.compare_and_store(original, updated)); } -TU_GENX(tu_autotune_begin_renderpass); -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +/** RP-level CS emissions **/ + +void +tu_autotune::begin_renderpass( + struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count) { - if (!autotune_result) + if (!rp_ctx) return; - if (!autotune_result->bo.iova) - return; + assert(sysmem || tile_count > 0); + assert(!sysmem || tile_count == 0); - uint64_t result_iova = autotune_result->bo.iova; - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE - * event here, composing a pair of these events that firmware handles without - * issue. This first event writes into the samples_end field and the second - * event overwrites it. The second event also enables the accumulation flag - * even when we don't use that result because the blob always sets it. 
- */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end)); - } - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } else { - result_iova += offsetof(struct tu_renderpass_samples, samples_end); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + rp_ctx->allocate(sysmem, tile_count); + rp_ctx->emit_rp_start(cmd, cs); +} + +void +tu_autotune::end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_rp_end(cmd, cs); +} + +/** Tile-level CS emissions **/ + +void +tu_autotune::begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_start(cmd, cs, tile_idx); +} + +void +tu_autotune::end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_end(cmd, cs, tile_idx); } -TU_GENX(tu_autotune_end_renderpass); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index c374e86ab89..b9bcf6ee0da 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -8,150 +8,265 @@ #include "tu_common.h" -#include "util/hash_table.h" -#include "util/rwlock.h" +#include +#include +#include +#include +#include +#include +#include +#include "tu_cs.h" #include "tu_suballoc.h" -struct tu_renderpass_history; +/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */ +#define TU_AUTOTUNE_DEBUG_PERFCTR 0 -/** - * "autotune" our decisions about bypass vs GMEM rendering, based on historical - * data about a given render target. - * - * In deciding which path to take there are tradeoffs, including some that - * are not reasonably estimateable without having some additional information: - * - * (1) If you know you are touching every pixel (ie. there is a clear), - * then the GMEM path will at least not cost more memory bandwidth than - * sysmem[1] - * - * (2) If there is no clear, GMEM could potentially cost *more* bandwidth - * if there is sysmem->GMEM restore pass. - * - * (3) If you see a high draw count, that is an indication that there will be - * enough pixels accessed multiple times to benefit from the reduced - * memory bandwidth that GMEM brings - * - * (4) But high draw count where there is not much overdraw can actually be - * faster in bypass mode if it is pushing a lot of state change, due to - * not having to go thru the state changes per-tile[1] - * - * The approach taken is to measure the samples-passed for the batch to estimate - * the amount of overdraw to detect cases where the number of pixels touched is - * low. - * - * [1] ignoring early-tile-exit optimizations, but any draw that touches all/ - * most of the tiles late in the tile-pass can defeat that +/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on + * dynamic analysis of the rendering workload via on-GPU profiling. 
This lets us make much better decisions than static + * analysis, since we can adapt to the actual workload rather than relying on heuristics. */ struct tu_autotune { - - /* We may have to disable autotuner if there are too many - * renderpasses in-flight. - */ - bool enabled; - + private: + bool enabled = true; struct tu_device *device; - /** - * Cache to map renderpass key to historical information about - * rendering to that particular render target. - */ - struct hash_table *ht; - struct u_rwlock ht_lock; + /** Configuration **/ - /** - * List of per-renderpass results that we are waiting for the GPU - * to finish with before reading back the results. - */ - struct list_head pending_results; + enum class algorithm : uint8_t; + enum class mod_flag : uint8_t; + enum class metric_flag : uint8_t; + /* Container for all autotune configuration options. */ + struct PACKED config_t; + union PACKED packed_config_t; - /** - * List of per-submission data that we may want to free after we - * processed submission results. - * This could happend after command buffers which were in the submission - * are destroyed. - */ - struct list_head pending_submission_data; + /* Allows for thread-safe access to the configurations. */ + struct atomic_config_t { + private: + std::atomic config_bits = 0; - /** - * List of per-submission data that has been finished and can be reused. - */ - struct list_head submission_data_pool; + public: + atomic_config_t(config_t initial_config); - uint32_t fence_counter; - uint32_t idx_counter; + config_t load() const; + + bool compare_and_store(config_t updated, config_t expected); + } active_config; + + config_t get_env_config(); + + /** Global Fence and Internal CS Management **/ + + /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers. + * Synchronized by suballoc_mutex. + */ + struct tu_suballocator suballoc; + std::mutex suballoc_mutex; + + /* The next value to assign to tu6_global::autotune_fence, this is incremented during on_submit. */ + uint32_t next_fence = 1; + + /* A wrapper around a CS which sets the global autotune fence to a certain fence value, this allows for ergonomically + * managing the lifetime of the CS including recycling it after the fence value has been reached. + */ + struct submission_entry { + private: + uint32_t fence; + struct tu_cs fence_cs; + + public: + explicit submission_entry(tu_device *device); + + ~submission_entry(); + + /* Disable move/copy, since this holds stable pointers to the fence_cs. */ + submission_entry(const submission_entry &) = delete; + submission_entry &operator=(const submission_entry &) = delete; + submission_entry(submission_entry &&) = delete; + submission_entry &operator=(submission_entry &&) = delete; + + /* The current state of the submission entry, this is used to track whether the CS is available for reuse, pending + * GPU completion or currently being processed. + */ + bool is_active() const; + + /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */ + struct tu_cs *try_get_cs(uint32_t new_fence); + }; + + /* Unified pool for submission CSes. + * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry. + */ + std::deque submission_entries; + + /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. 
*/
+   struct tu_cs *get_cs_for_fence(uint32_t fence);
+
+   /** RP Entry Management **/
+
+   struct rp_gpu_data;
+   struct tile_gpu_data;
+   struct rp_entry;
+
+   /* A wrapper over all entries associated with a single command buffer. */
+   struct rp_entry_batch {
+      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
+                         valid fence. */
+      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
+                         determine when the entries can be processed. */
+      std::vector<std::unique_ptr<rp_entry>> entries;
+
+      rp_entry_batch();
+
+      /* Disable the copy/move to avoid performance hazards. */
+      rp_entry_batch(const rp_entry_batch &) = delete;
+      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
+      rp_entry_batch(rp_entry_batch &&) = delete;
+      rp_entry_batch &operator=(rp_entry_batch &&) = delete;
+
+      void assign_fence(uint32_t new_fence);
+   };
+
+   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
+    * iteration and to ensure that we process the entries in the same order they were submitted.
+    */
+   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
+
+   /* Handles processing of entry batches that are pending to be processed.
+    *
+    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
+    *       in the on_submit() method, which is called on every submit of a command buffer.
+    */
+   void process_entries();
+
+   /** Renderpass State Tracking **/
+
+   struct rp_history;
+   struct rp_history_handle;
+
+   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
+    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
+    *
+    * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
+    *       rp_key, which would lead to true value-based equality rather than just hash-based equality; that has a cost
+    *       but avoids hash collisions causing issues.
+    */
+   struct rp_key {
+      uint64_t hash;
+
+      rp_key(const struct tu_render_pass *pass,
+             const struct tu_framebuffer *framebuffer,
+             const struct tu_cmd_buffer *cmd);
+
+      /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
+      rp_key(const rp_key &key, uint32_t duplicates);
+
+      /* Equality operator, used in unordered_map. */
+      constexpr bool operator==(const rp_key &other) const noexcept
+      {
+         return hash == other.hash;
+      }
+   };
+
+   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
+    *
+    * Note: This should *NEVER* be used to calculate the hash itself as it would lead to the hash being calculated
+    *       multiple times, rather than being calculated once and reused when there are multiple successive lookups like
+    *       with find_or_create_rp_history() and providing the hash to the rp_history constructor.
+    */
+   struct rp_hash {
+      constexpr size_t operator()(const rp_key &key) const noexcept
+      {
+         /* Note: This will throw away the upper 32-bits on 32-bit architectures. */
+         return static_cast<size_t>(key.hash);
+      }
+   };
+
+   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
+   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
+   rp_histories_t rp_histories;
+   std::shared_mutex rp_mutex;
+   uint64_t last_reap_ts = 0;
+
+   /* Note: These will lock rp_mutex internally, no need to lock it.
*/ + rp_history_handle find_rp_history(const rp_key &key); + rp_history_handle find_or_create_rp_history(const rp_key &key); + void reap_old_rp_histories(); + + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + const fd_perfcntr_group *cp_group; + const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count; +#endif + + public: + tu_autotune(struct tu_device *device, VkResult &result); + + ~tu_autotune(); + + /* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */ + using rp_ctx_t = rp_entry *; + + /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB. + * + * Note: tu_cmd_buffer is only responsible for the lifetime of this object, all the access to the context state is + * done through tu_autotune. + */ + struct cmd_buf_ctx { + private: + /* A batch of all entries from RPs within this CB. */ + std::shared_ptr batch; + + /* Creates a new RP entry attached to this CB. */ + rp_entry * + attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count); + + rp_entry *find_rp_entry(const rp_key &key); + + friend struct tu_autotune; + + public: + cmd_buf_ctx(); + ~cmd_buf_ctx(); + + /* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */ + void reset(); + }; + + enum class render_mode { + SYSMEM, + GMEM, + }; + + render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx); + + /* Returns the optimal tile size divisor for the given CB state. */ + uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer); + + /* Disables preemption latency optimization within the autotuner, this is used when high-priority queues are used to + * ensure that the autotuner does not interfere with the high-priority queue's performance. + * + * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior. + */ + void disable_preempt_optimize(); + + void + begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count); + + void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx); + + void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + + void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + + /* The submit-time hook for autotuner, this may return a CS (can be NULL) which must be amended for autotuner + * tracking to function correctly. + * + * Note: This must be called from a single-threaded context. There should never be multiple threads calling this + * function at the same time. + */ + struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count); }; -/** - * From the cmdstream, the captured samples-passed values are recorded - * at the start and end of the batch. - * - * Note that we do the math on the CPU to avoid a WFI. But pre-emption - * may force us to revisit that. - */ -struct PACKED tu_renderpass_samples { - uint64_t samples_start; - /* hw requires the sample start/stop locations to be 128b aligned. */ - uint64_t __pad0; - uint64_t samples_end; - uint64_t __pad1; -}; - -/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */ -static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16); - -/** - * Tracks the results from an individual renderpass. 
Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. - */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev); -void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev); - -bool tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result); -void tu_autotune_free_results(struct tu_device *dev, struct list_head *results); - -bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -/** - * A magic 8-ball that tells the gmem code whether we should do bypass mode - * for moar fps. - */ -struct tu_cs *tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -struct tu_autotune_results_buffer; - -template -void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -#endif /* TU_AUTOTUNE_H */ +#endif /* TU_AUTOTUNE_H */ \ No newline at end of file diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index 5ba807930ef..a2c75744266 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) } } - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd); + + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); } struct apply_store_coords_state { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index e734241aeaa..0281ce7b857 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -14,6 +14,7 @@ #include "vk_render_pass.h" #include "vk_util.h" +#include "tu_autotune.h" #include "tu_buffer.h" #include "tu_clear_blit.h" #include "tu_cs.h" @@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling) static bool use_hw_binning(struct tu_cmd_buffer *cmd) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; + struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = + tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); /* XFB commands are emitted for BINNING || SYSMEM, which makes it @@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd) return true; } - return vsc->binning; + return vsc->binning_possible && vsc->binning_useful; } static bool use_sysmem_rendering(struct 
tu_cmd_buffer *cmd, - struct tu_renderpass_result **autotune_result) + tu_autotune::rp_ctx_t *rp_ctx) { if (TU_DEBUG(SYSMEM)) { cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)"; @@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; } - if (TU_DEBUG(GMEM)) + if (TU_DEBUG(GMEM)) { + cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)"; return false; - - bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune, - cmd, autotune_result); - if (*autotune_result) { - list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results); } - if (use_sysmem) { + /* This is a case where it's better to avoid GMEM, too many tiles but no HW binning possible. */ + if (!vsc->binning_possible && vsc->binning_useful) { + cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible"; + return true; + } + + bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM; + if (use_sysmem) cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem"; - } return use_sysmem; } @@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd, template static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { const struct tu_framebuffer *fb = cmd->state.framebuffer; @@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP)); } - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0); tu_cs_sanity_check(cs); } @@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); - /* Do any resolves of the last subpass. These are handled in the * tile_store_cs in the gmem path. 
*/ @@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit(cs, 0); /* value */ } + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); + tu_cs_sanity_check(cs); } @@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { struct tu_physical_device *phys_dev = cmd->device->physical_device; @@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (use_cb) tu_trace_start_render_pass(cmd); - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height; + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count); tu_cs_sanity_check(cs); } @@ -3471,13 +3476,18 @@ template static void tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_tile_config *tile, - bool fdm, const VkOffset2D *fdm_offsets) + bool fdm, const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { + uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x; tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm, fdm_offsets); tu_lrz_before_tile(cmd, &cmd->cs); trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd); + cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx); + /* Primitives that passed all tests are still counted in in each * tile even with HW binning beforehand. Do not permit it. */ @@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (cmd->state.prim_generated_query_running_before_rp) tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); + cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx); + if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) | @@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); - tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); tu_lrz_tiling_end(cmd, cs); @@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE); + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); + tu_cs_sanity_check(cs); } @@ -3796,7 +3808,9 @@ void tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2, const struct tu_image_view *fdm, - const VkOffset2D *fdm_offsets) + const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { uint32_t width = tx2 - tx1; uint32_t height = ty2 - ty1; @@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, continue; tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], - true, fdm_offsets); + true, fdm_offsets, + rp_ctx, vsc); } } } @@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { const struct tu_tiling_config *tiling = 
cmd->state.tiling; @@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_emit_tile_store_cs(cmd, &cmd->tile_store_cs); tu_cs_end(&cmd->tile_store_cs); - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets); + tu6_tile_render_begin(cmd, &cmd->cs, rp_ctx, fdm_offsets); /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. @@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, if (merge_tiles) { tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm, - fdm_offsets); + fdm_offsets, rp_ctx, vsc); continue; } @@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets); tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm, - fdm_offsets); + fdm_offsets, + rp_ctx, vsc); } slot_row += tile_row_stride; } } } - tu6_tile_render_end(cmd, &cmd->cs, autotune_result); + tu6_tile_render_end(cmd, &cmd->cs, rp_ctx); tu_trace_end_render_pass(cmd, true); @@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, template static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { VkResult result = tu_allocate_transient_attachments(cmd, true); @@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, tu_trace_start_render_pass(cmd); - tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_begin(cmd, &cmd->cs, rp_ctx); trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd); @@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); - tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_end(cmd, &cmd->cs, rp_ctx); tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace, cmd->trace_renderpass_start, @@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer, if (cmd_buffer->state.rp.has_tess) tu6_lazy_emit_tessfactor_addr(cmd_buffer); - struct tu_renderpass_result *autotune_result = NULL; - if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); + tu_autotune::rp_ctx_t rp_ctx = NULL; + if (use_sysmem_rendering(cmd_buffer, &rp_ctx)) + tu_cmd_render_sysmem(cmd_buffer, rp_ctx); else - tu_cmd_render_tiles(cmd_buffer, autotune_result, fdm_offsets); + tu_cmd_render_tiles(cmd_buffer, rp_ctx, fdm_offsets); /* Outside of renderpasses we assume all draw states are disabled. 
We do * this outside the draw CS for the normal case where 3d gmem stores aren't @@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments = NULL; cmd_buffer->state.clear_values = NULL; cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ + cmd_buffer->state.gmem_layout_divisor = 0; cmd_buffer->state.renderpass_cb_disabled = false; memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); @@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool, u_trace_init(&cmd_buffer->rp_trace, &device->trace_context); cmd_buffer->trace_renderpass_start = u_trace_begin_iterator(&cmd_buffer->rp_trace); - list_inithead(&cmd_buffer->renderpass_autotune_results); + new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx(); if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) { cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device); @@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) u_trace_fini(&cmd_buffer->trace); u_trace_fini(&cmd_buffer->rp_trace); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.~cmd_buf_ctx(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { if (cmd_buffer->descriptors[i].push_set.layout) @@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, tu_cs_reset(&cmd_buffer->pre_chain.draw_cs); tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.reset(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); @@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, cmd->state.clear_values = suspended->state.suspended_pass.clear_values; cmd->state.render_area = suspended->state.suspended_pass.render_area; cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor; + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); cmd->state.lrz = suspended->state.suspended_pass.lrz; } @@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r * (perf queries), then we can't do this optimization since the * start-of-the-CS geometry condition will have been overwritten. 
*/ - bool cond_load_allowed = vsc->binning && + bool cond_load_allowed = vsc->binning_possible && cmd->state.pass->has_cond_load_store && !cmd->state.rp.draw_cs_writes_to_cond_pred; @@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.suspended_pass.attachments = cmd->state.attachments; cmd->state.suspended_pass.clear_values = cmd->state.clear_values; cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; + cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor; } tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 4e974e12827..0f8aa1500d6 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -524,11 +524,12 @@ struct tu_cmd_state /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU * might get used by tu_store_gmem_attachment(). */ - enum tu_gmem_layout gmem_layout; + tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; const struct tu_tiling_config *tiling; VkRect2D render_area; @@ -543,9 +544,10 @@ struct tu_cmd_state struct { const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; VkRect2D render_area; enum tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_image_view **attachments; VkClearValue *clear_values; @@ -644,8 +646,7 @@ struct tu_cmd_buffer struct u_trace_iterator trace_renderpass_start; struct u_trace trace, rp_trace; - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; + tu_autotune::cmd_buf_ctx autotune_ctx; void *patchpoints_ctx; struct util_dynarray fdm_bin_patchpoints; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index dceb5227116..d593fbfc26c 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false) DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false) DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) + DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_SECTION_END }; @@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction"); instance->enable_softfloat32 = driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32"); + instance->autotune_algo = + driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); } static uint32_t instance_count = 0; @@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device) { mtx_destroy(&device->bo_mutex); mtx_destroy(&device->pipeline_mutex); - mtx_destroy(&device->autotune_mutex); mtx_destroy(&device->kgsl_profiling_mutex); mtx_destroy(&device->event_mutex); mtx_destroy(&device->trace_mutex); @@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, VkResult result; struct tu_device *device; bool border_color_without_format = false; + bool autotune_disable_preempt_optimize = false; vk_foreach_struct_const (ext, pCreateInfo->pNext) { switch (ext->sType) { @@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->bo_mutex, mtx_plain); 
mtx_init(&device->pipeline_mutex, mtx_plain); - mtx_init(&device->autotune_mutex, mtx_plain); mtx_init(&device->kgsl_profiling_mutex, mtx_plain); mtx_init(&device->event_mutex, mtx_plain); mtx_init(&device->trace_mutex, mtx_plain); @@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; + const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = + vk_find_struct_const(queue_create->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const VkQueueGlobalPriorityKHR global_priority = priority_info ? + priority_info->globalPriority : + (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR); uint32_t qfi = queue_create->queueFamilyIndex; enum tu_queue_type type = physical_device->queue_families[qfi].type; device->queues[qfi] = (struct tu_queue *) vk_alloc( @@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->queue_count[qfi] = queue_create->queueCount; for (unsigned q = 0; q < queue_create->queueCount; q++) { - result = tu_queue_init(device, &device->queues[qfi][q], type, q, - queue_create); + result = tu_queue_init(device, &device->queues[qfi][q], type, + global_priority, q, queue_create); if (result != VK_SUCCESS) { device->queue_count[qfi] = q; goto fail_queues; } } + + autotune_disable_preempt_optimize |= + (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR); } result = vk_meta_device_init(&device->vk, &device->meta); @@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_INTERNAL_RESOURCE), "pipeline_suballoc"); - tu_bo_suballocator_init(&device->autotune_suballoc, device, - 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, - "autotune_suballoc"); if (is_kgsl(physical_device->instance)) { tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, @@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } pthread_condattr_destroy(&condattr); - result = tu_autotune_init(&device->autotune, device); - if (result != VK_SUCCESS) { + device->autotune = new tu_autotune(device, result); + if (result != VK_SUCCESS) goto fail_timeline_cond; - } + + if (autotune_disable_preempt_optimize) + device->autotune->disable_preempt_optimize(); device->use_z24uint_s8uint = physical_device->info->props.has_z24uint_s8uint && @@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) free(device->dbg_renderpass_stomp_cs); } - tu_autotune_fini(&device->autotune, device); + delete device->autotune; tu_bo_suballocator_finish(&device->pipeline_suballoc); - tu_bo_suballocator_finish(&device->autotune_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); tu_bo_suballocator_finish(&device->vis_stream_suballocator); @@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device, } } - tu_framebuffer_tiling_config(framebuffer, device, pass); + tu_framebuffer_init_tiling_config(framebuffer, device, pass); /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */ if (msrtss_attachment_count > 0) { @@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer, view->image->max_tile_h_constraint_fdm; } - tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass); + tu_framebuffer_init_tiling_config(framebuffer, 
cmd_buffer->device, pass); } VkResult diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 08c102ae145..dffb2c3f001 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -28,6 +28,7 @@ #include "common/freedreno_rd_output.h" #include "util/vma.h" #include "util/u_vector.h" +#include "util/rwlock.h" /* queue types */ #define TU_QUEUE_GENERAL 0 @@ -233,6 +234,9 @@ struct tu_instance * However we don't want native Vulkan apps using this. */ bool enable_softfloat32; + + /* Configuration option to use a specific autotune algorithm by default. */ + const char *autotune_algo; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) @@ -265,7 +269,12 @@ struct tu6_global volatile uint32_t vtx_stats_query_not_running; - /* To know when renderpass stats for autotune are valid */ + /* A fence with a monotonically increasing value that is + * incremented by the GPU on each submission that includes + * a tu_autotune::submission_entry CS. This is used to track + * which submissions have been processed by the GPU before + * processing the autotune packet on the CPU. + */ volatile uint32_t autotune_fence; /* For recycling command buffers for dynamic suspend/resume comamnds */ @@ -355,12 +364,6 @@ struct tu_device struct tu_suballocator pipeline_suballoc; mtx_t pipeline_mutex; - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. - */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on * each submission. */ @@ -462,7 +465,7 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; - struct tu_autotune autotune; + struct tu_autotune *autotune; struct breadcrumbs_context *breadcrumbs_ctx; @@ -547,8 +550,11 @@ struct tu_vsc_config { /* Whether binning could be used for gmem rendering using this framebuffer. */ bool binning_possible; - /* Whether binning should be used for gmem rendering using this framebuffer. */ - bool binning; + /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether + * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance + * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so. + */ + bool binning_useful; /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; @@ -577,7 +583,8 @@ struct tu_framebuffer uint32_t max_tile_w_constraint; uint32_t max_tile_h_constraint; - struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT]; + uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. 
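+ * The entry for a given (gmem_layout, divisor) pair is stored at
+ * tiling[TU_GMEM_LAYOUT_COUNT * (divisor - 1) + gmem_layout]; divisors above initd_divisor are
+ * filled in lazily by tu_framebuffer_get_tiling_config().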
 */ + struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX]; uint32_t attachment_count; const struct tu_image_view *attachments[0]; diff --git a/src/freedreno/vulkan/tu_pass.h b/src/freedreno/vulkan/tu_pass.h index da92babc657..5dc515f8db6 100644 --- a/src/freedreno/vulkan/tu_pass.h +++ b/src/freedreno/vulkan/tu_pass.h @@ -22,6 +22,8 @@ enum tu_gmem_layout TU_GMEM_LAYOUT_COUNT, }; +constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* divisors 1 (no division) through 6 (1/6 of the base tile size) */ + struct tu_subpass_barrier { VkPipelineStageFlags2 src_stage_mask; VkPipelineStageFlags2 dst_stage_mask; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index a87a73f0cd4..7563e2c3b45 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) struct tu_device *device = queue->device; bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); struct util_dynarray dump_cmds; + struct tu_cs *autotune_cs = NULL; if (vk_submit->buffer_bind_count || vk_submit->image_bind_count || @@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) } } - if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { - struct tu_cs *autotune_cs = tu_autotune_on_submit( - device, &device->autotune, cmd_buffers, cmdbuf_count); + autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count); + if (autotune_cs) { submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries, autotune_cs->entry_count); } @@ -605,17 +605,10 @@ VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, enum tu_queue_type type, + const VkQueueGlobalPriorityKHR global_priority, int idx, const VkDeviceQueueCreateInfo *create_info) { - const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = - vk_find_struct_const(create_info->pNext, - DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); - const VkQueueGlobalPriorityKHR global_priority = priority_info ? - priority_info->globalPriority : - (TU_DEBUG(HIPRIO) ?
VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR : - VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR); - const int priority = tu_get_submitqueue_priority( device->physical_device, global_priority, type, device->vk.enabled_features.globalPriorityQuery); diff --git a/src/freedreno/vulkan/tu_queue.h b/src/freedreno/vulkan/tu_queue.h index 28925bfcb50..278756a43af 100644 --- a/src/freedreno/vulkan/tu_queue.h +++ b/src/freedreno/vulkan/tu_queue.h @@ -43,6 +43,7 @@ VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, enum tu_queue_type type, + const VkQueueGlobalPriorityKHR global_priority, int idx, const VkDeviceQueueCreateInfo *create_info); diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index e19d43bb8a9..ffd2975659b 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc) return tiles_per_pipe <= 32; } +static void +tu_tiling_config_divide_tile(const struct tu_device *dev, + const struct tu_render_pass *pass, + const struct tu_framebuffer *fb, + const struct tu_tiling_config *tiling, + struct tu_tiling_config *new_tiling, + uint32_t divisor) +{ + assert(divisor > 0); + + *new_tiling = *tiling; + if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) { + /* If the divisor is 1, or if the tiling is not possible, or if the + * tiling is invalid, just return the original tiling. */ + return; + } + + /* Get the hardware-specified alignment values. */ + const uint32_t tile_align_w = pass->tile_align_w; + const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; + + /* Divide the current tile dimensions by the divisor. */ + uint32_t new_tile_width = tiling->tile0.width / divisor; + uint32_t new_tile_height = tiling->tile0.height / divisor; + + /* Clamp to the minimum alignment if necessary and align down. */ + if (new_tile_width < tile_align_w) + new_tile_width = tile_align_w; + else + new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w); + + if (new_tile_height < tile_align_h) + new_tile_height = tile_align_h; + else + new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h); + + new_tiling->tile0.width = new_tile_width; + new_tiling->tile0.height = new_tile_height; + + /* Recalculate the tile count from the framebuffer dimensions to ensure + * full coverage. 
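+ *
+ * For example (illustrative numbers only, assuming a 32x16 tile alignment): dividing a 256x192 base tile
+ * by 2 gives a 128x96 tile, so a 1920x1080 framebuffer would end up with
+ * DIV_ROUND_UP(1920, 128) x DIV_ROUND_UP(1080, 96) = 15 x 12 tiles.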
*/ + new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width); + new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height); +} + static void tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc, const struct tu_device *dev, @@ -460,22 +505,18 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc, static void tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device) { - if (vsc->binning_possible) { - vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2; + vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2; - if (TU_DEBUG(FORCEBIN)) - vsc->binning = true; - if (TU_DEBUG(NOBIN)) - vsc->binning = false; - } else { - vsc->binning = false; - } + if (TU_DEBUG(FORCEBIN)) + vsc->binning_useful = true; + if (TU_DEBUG(NOBIN)) + vsc->binning_useful = false; } void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass) +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass) { for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) { struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; @@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, tu_tiling_config_update_binning(fdm_offset_vsc, device); } } + + fb->initd_divisor = 1; +} + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor) +{ + assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX); + assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to + appropriately size the tiles for the framebuffer.*/ + struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout]; + + if (divisor > fb->initd_divisor) { + const struct tu_tiling_config *base_tiling = + tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1); + tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor); + + struct tu_vsc_config *vsc = &tiling->vsc; + if (tiling->possible) { + tu_tiling_config_update_pipe_layout(vsc, device, false); + tu_tiling_config_update_pipes(vsc, device); + tu_tiling_config_update_binning(vsc, device); + + struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc; + fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 }; + } + + if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */ + (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */ + (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */ + ) { + /* Revert to the previous level's tiling configuration. */ + *tiling = *base_tiling; + } + + fb->initd_divisor = divisor; + } + + return tiling; } void diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h index 7ce6d3e053a..b1ed4354e39 100644 --- a/src/freedreno/vulkan/tu_util.h +++ b/src/freedreno/vulkan/tu_util.h @@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...) 
} while (0) void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass); + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor); #define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) diff --git a/src/util/driconf.h b/src/util/driconf.h index 42a1c213df1..1ec69d1bd09 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -657,6 +657,10 @@ DRI_CONF_OPT_B(tu_enable_softfloat32, def, \ "Enable softfloat emulation for float32 denormals") +#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \ + DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ + "Set the preferred autotune algorithm") + /** * \brief Honeykrisp specific configuration options */ diff --git a/src/util/rand_xor.h b/src/util/rand_xor.h index b55598f228a..830c6c3e727 100644 --- a/src/util/rand_xor.h +++ b/src/util/rand_xor.h @@ -28,10 +28,18 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + uint64_t rand_xorshift128plus(uint64_t seed[2]); void s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed); +#ifdef __cplusplus +} /* end of extern "C" */ +#endif + #endif /* RAND_XOR_H */ diff --git a/src/util/u_math.h b/src/util/u_math.h index 354683bb4ce..2c5f97b9875 100644 --- a/src/util/u_math.h +++ b/src/util/u_math.h @@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment) return ((value) & ~(uint64_t)(alignment - 1)); } +static inline uint64_t +ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment) +{ + return value - (value % alignment); +} + /** * Align a value, only works pot alignemnts. */
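
Illustrative sketch (not part of the patch): the new ROUND_DOWN_TO_NPOT helper rounds a value down to a multiple of an alignment that need not be a power of two, which is what the tile-division code above relies on when aligning divided tile sizes. The values below are made up for demonstration.

#include <assert.h>
#include <stdint.h>

/* Same definition as the u_math.h hunk above. */
static inline uint64_t
ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
{
   return value - (value % alignment);
}

int
main(void)
{
   /* 96 is the largest multiple of 24 that does not exceed 100. */
   assert(ROUND_DOWN_TO_NPOT(100, 24) == 96);
   /* For power-of-two alignments it behaves like the existing ROUND_DOWN_TO. */
   assert(ROUND_DOWN_TO_NPOT(100, 32) == 96);
   return 0;
}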