diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index 07185a93b93..abd34c2e965 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -31,6 +31,7 @@ tu_entrypoints = custom_target(
 
 
 libtu_files = files(
+  'tu_autotune.c',
   'tu_clear_blit.c',
   'tu_cmd_buffer.c',
   'tu_cs.c',
diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
new file mode 100644
index 00000000000..7ed78e1727e
--- /dev/null
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright © 2021 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <vulkan/vulkan_core.h>
+
+#include "tu_autotune.h"
+#include "tu_private.h"
+#include "tu_cs.h"
+
+/* In Vulkan, an application may fill command buffers from many threads
+ * and expect no locking to occur. We do introduce the possibility of
+ * locking on renderpass end; however, assuming that the application
+ * doesn't have a huge number of slightly different renderpasses,
+ * there should be little to no contention.
+ *
+ * Other assumptions are:
+ * - The application submits command buffers soon after their creation.
+ *
+ * Breaking the above may lead to some decrease in performance or
+ * to the autotuner turning itself off.
+ */
+
+#define TU_AUTOTUNE_DEBUG_LOG 0 /* non-zero: log autotuner decisions via mesa_logi() */
+/* Dump history entries on autotuner finish,
+ * could be used to gather data from traces.
+ */
+#define TU_AUTOTUNE_LOG_AT_FINISH 0
+
+#define MAX_HISTORY_RESULTS 5    /* results kept per renderpass history */
+#define MAX_HISTORY_LIFETIME 128 /* fences a history may go unused before removal */
+
+/**
+ * Tracks results for a given renderpass key
+ */
+struct tu_renderpass_history {
+   uint64_t key; /* hash produced by hash_renderpass_instance() */
+
+   /* Fence of the latest submission that used this key; old entries get deleted */
+   uint32_t last_fence;
+
+   /**
+    * List of recent tu_renderpass_result's
+    */
+   struct list_head results;
+   uint32_t num_results; /* length of the list, capped at MAX_HISTORY_RESULTS */
+
+   uint32_t avg_samples; /* average samples_passed over the list, set with p_atomic_set() */
+};
+
+/* Holds per-submission cs which writes the fence. */
+struct tu_submission_fence_cs {
+   struct list_head node; /* link in tu_autotune::pending_submission_cs */
+   struct tu_cs cs;       /* command stream that writes the fence value */
+   uint32_t fence;        /* value this cs writes; the cs is freed once it is reached */
+};
+
+#define APPEND_TO_HASH(state, field) \
+   XXH64_update((state), &(field), sizeof(field)) /* hashes field's raw bytes; caller adds ';' */
+
+static uint64_t
+hash_renderpass_instance(const struct tu_render_pass *pass,
+                         const struct tu_framebuffer *framebuffer,
+                         const struct tu_cmd_buffer *cmd) {
+   XXH64_state_t hash_state;
+   XXH64_reset(&hash_state, 0);
+   /* Key = XXH64 over properties of this renderpass instance; fb size first. */
+   APPEND_TO_HASH(&hash_state, framebuffer->width);
+   APPEND_TO_HASH(&hash_state, framebuffer->height);
+   APPEND_TO_HASH(&hash_state, framebuffer->layers);
+   /* Raw bytes of the pass attachment array (NOTE(review): incl. any padding). */
+   APPEND_TO_HASH(&hash_state, pass->attachment_count);
+   XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));
+   /* Per-attachment view extent and image format/layer/level counts. */
+   for (unsigned i = 0; i < pass->attachment_count; i++) {
+      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
+      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
+      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk_format);
+      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->layer_count);
+      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->level_count);
+   }
+   /* Per-subpass sample count and input/color/resolve attachment counts. */
+   APPEND_TO_HASH(&hash_state, pass->subpass_count);
+   for (unsigned i = 0; i < pass->subpass_count; i++) {
+      APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
+      APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
+      APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
+      APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
+   }
+
+   return XXH64_digest(&hash_state);
+}
+
+static void
+history_destructor(void *h)
+{
+   struct tu_renderpass_history *hist = h;
+   /* ralloc destructor: free every result still linked to this history. */
+   list_for_each_entry_safe(struct tu_renderpass_result, res,
+                            &hist->results, node) {
+      ralloc_free(res);
+   }
+}
+
+static void
+result_destructor(void *r)
+{
+   struct tu_renderpass_result *res = r;
+
+   /* Unlink the node in case some list (e.g. pending_results) still holds it. */
+   list_del(&res->node);
+}
+
+static bool
+get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
+{
+   bool found = false;
+
+   /* If lock contention is ever found in the wild,
+    * we could use try_lock here.
+    */
+   u_rwlock_rdlock(&at->ht_lock);
+   struct hash_entry *he =
+      _mesa_hash_table_search(at->ht, &rp_key);
+   struct tu_renderpass_history *hist = he ? he->data : NULL;
+   /* A history only counts once a result has been folded into it;
+    * otherwise *avg_samples is left untouched. */
+   if (hist && hist->num_results > 0) {
+      *avg_samples = p_atomic_read(&hist->avg_samples);
+      found = true;
+   }
+   u_rwlock_rdunlock(&at->ht_lock);
+
+   return found;
+}
+
+static struct tu_renderpass_result *
+create_history_result(struct tu_autotune *at, uint64_t rp_key)
+{
+   struct tu_renderpass_result *result = rzalloc_size(NULL, sizeof(*result));
+   if (!result)
+      return NULL; /* callers treat NULL as "no autotune data for this pass" */
+   result->idx = p_atomic_inc_return(&at->idx_counter);
+   result->rp_key = rp_key;
+   list_inithead(&result->node); /* keeps result_destructor's list_del() safe */
+   ralloc_set_destructor(result, result_destructor);
+   return result;
+}
+
+static void
+history_add_result(struct tu_renderpass_history *history,
+                      struct tu_renderpass_result *result)
+{
+   list_delinit(&result->node);
+   list_add(&result->node, &history->results);
+
+   if (history->num_results < MAX_HISTORY_RESULTS) {
+      history->num_results++;
+   } else {
+      /* Once above the limit, start popping old results off the
+       * tail of the list:
+       */
+      struct tu_renderpass_result *old_result =
+         list_last_entry(&history->results, struct tu_renderpass_result, node);
+      list_delinit(&old_result->node);
+      ralloc_free(old_result);
+   }
+
+   /* Do calculations here to avoid locking history in tu_autotune_use_bypass.
+    * samples_passed is 64-bit: sum in 64 bits, clamp the stored average. */
+   uint64_t total_samples = 0;
+   list_for_each_entry(struct tu_renderpass_result, entry,
+                       &history->results, node) {
+      total_samples += entry->samples_passed;
+   }
+   uint64_t avg_samples = total_samples / history->num_results;
+   p_atomic_set(&history->avg_samples, (uint32_t)MIN2(avg_samples, UINT32_MAX));
+}
+
+static void
+process_results(struct tu_autotune *at)
+{
+   uint32_t current_fence = at->results->fence; /* last fence the GPU wrote back */
+   /* idx range consumed by this call, used by the trample check at the end. */
+   uint32_t min_idx = ~0;
+   uint32_t max_idx = 0;
+   /* The list is in submission order: stop at the first unfinished result. */
+   list_for_each_entry_safe(struct tu_renderpass_result, result,
+                            &at->pending_results, node) {
+      if (result->fence > current_fence)
+         break;
+      /* NOTE(review): assumes the history outlived on_submit's cleanup - confirm. */
+      struct tu_renderpass_history *history = result->history;
+
+      min_idx = MIN2(min_idx, result->idx);
+      max_idx = MAX2(max_idx, result->idx);
+      uint32_t idx = result->idx % ARRAY_SIZE(at->results->result);
+
+      result->samples_passed = at->results->result[idx].samples_end -
+                               at->results->result[idx].samples_start;
+
+      history_add_result(history, result);
+   }
+   /* Fence command streams of finished submissions can be released now. */
+   list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
+                            &at->pending_submission_cs, node) {
+      if (submission_cs->fence > current_fence)
+         break;
+
+      list_del(&submission_cs->node);
+      tu_cs_finish(&submission_cs->cs);
+      free(submission_cs);
+   }
+
+   if (max_idx - min_idx > TU_AUTOTUNE_MAX_RESULTS) {
+      /* If results start to trample each other it's better to bail out */
+      at->enabled = false;
+      mesa_logw("disabling sysmem vs gmem autotuner because results "
+                "are trampling each other: min_idx=%u, max_idx=%u",
+                min_idx, max_idx);
+   }
+}
+
+static struct tu_cs *
+create_fence_cs(struct tu_device *dev, struct tu_autotune *at)
+{
+   struct tu_submission_fence_cs *fence_cs = calloc(1, sizeof(*fence_cs));
+   fence_cs->fence = at->fence_counter;
+   struct tu_cs *cs = &fence_cs->cs;
+
+   tu_cs_init(cs, dev, TU_CS_MODE_GROW, 5);
+   tu_cs_begin(cs);
+   /* CACHE_FLUSH_TS event carrying the results->fence address and the fence value. */
+   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
+   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
+   tu_cs_emit_qw(cs, autotune_results_ptr(at, fence));
+   tu_cs_emit(cs, at->fence_counter);
+
+   tu_cs_end(cs);
+
+   list_addtail(&fence_cs->node, &at->pending_submission_cs);
+
+   return cs;
+}
+
+struct tu_cs *
+tu_autotune_on_submit(struct tu_device *dev,
+                      struct tu_autotune *at,
+                      struct tu_cmd_buffer **cmd_buffers,
+                      uint32_t cmd_buffer_count)
+{
+   /* We are single-threaded here */
+   /* Harvest whatever the GPU has finished before handing out a new fence. */
+   process_results(at);
+
+   /* pre-increment so zero isn't valid fence */
+   uint32_t new_fence = ++at->fence_counter;
+
+   /* Create history entries here to minimize work and locking being
+    * done on renderpass end.
+    */
+   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
+      list_for_each_entry_safe(struct tu_renderpass_result, result,
+                          &cmdbuf->renderpass_autotune_results, node) {
+         struct tu_renderpass_history *history;
+         struct hash_entry *entry =
+            _mesa_hash_table_search(at->ht, &result->rp_key);
+         if (!entry) {
+            history = rzalloc_size(NULL, sizeof(*history));
+            ralloc_set_destructor(history, history_destructor);
+            history->key = result->rp_key;
+            list_inithead(&history->results);
+            /* Write lock only around the mutation; get_history() takes the read lock. */
+            u_rwlock_wrlock(&at->ht_lock);
+            _mesa_hash_table_insert(at->ht, &history->key, history);
+            u_rwlock_wrunlock(&at->ht_lock);
+         } else {
+            history = (struct tu_renderpass_history *) entry->data;
+         }
+
+         history->last_fence = new_fence;
+         /* Tag the result so process_results() knows when it is readable. */
+         result->fence = new_fence;
+         result->history = history;
+      }
+      /* Hand the buffer's results over to the device-wide pending list. */
+      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
+         list_splicetail(&cmdbuf->renderpass_autotune_results,
+                         &at->pending_results);
+         list_inithead(&cmdbuf->renderpass_autotune_results);
+      }
+   }
+
+#if TU_AUTOTUNE_DEBUG_LOG != 0
+   mesa_logi("Total history entries: %u", at->ht->entries);
+#endif
+
+   /* Cleanup old entries from history table. The assumption
+    * here is that application doesn't hold many old unsubmitted
+    * command buffers, otherwise this table may grow big.
+    */
+   hash_table_foreach(at->ht, entry) {
+      struct tu_renderpass_history *history = entry->data;
+      if (history->last_fence == 0 ||
+          (new_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
+         continue;
+
+#if TU_AUTOTUNE_DEBUG_LOG != 0
+      mesa_logi("Removed old history entry %016"PRIx64"", history->key);
+#endif
+
+      u_rwlock_wrlock(&at->ht_lock);
+      _mesa_hash_table_remove_key(at->ht, &history->key);
+      u_rwlock_wrunlock(&at->ht_lock);
+
+      ralloc_free(history);
+   }
+
+   return create_fence_cs(dev, at);
+}
+
+static bool
+renderpass_key_equals(const void *_a, const void *_b)
+{
+   return *(const uint64_t *)_b == *(const uint64_t *)_a; /* keys are 64-bit hashes */
+}
+
+static uint32_t
+renderpass_key_hash(const void *_a)
+{
+   return (uint32_t)*(const uint64_t *)_a; /* low 32 bits of the 64-bit key */
+}
+
+VkResult
+tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
+{
+   VkResult result;
+
+   at->enabled = true;
+   at->ht = _mesa_hash_table_create(NULL,
+                                    renderpass_key_hash,
+                                    renderpass_key_equals);
+   u_rwlock_init(&at->ht_lock);
+   /* Bail out instead of handing a NULL tu_bo to tu_bo_init_new(). */
+   at->results_bo = malloc(sizeof(struct tu_bo));
+   if (!at->results_bo) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      vk_startup_errorf(dev->instance, result, "Autotune BO alloc");
+      goto fail_bo; /* free(NULL) below is a no-op */
+   }
+   result = tu_bo_init_new(dev, at->results_bo,
+                           sizeof(struct tu_autotune_results),
+                           TU_BO_ALLOC_NO_FLAGS);
+   if (result != VK_SUCCESS) {
+      vk_startup_errorf(dev->instance, result, "Autotune BO init");
+      goto fail_bo;
+   }
+   result = tu_bo_map(dev, at->results_bo);
+   if (result != VK_SUCCESS) {
+      vk_startup_errorf(dev->instance, result, "Autotune BO map");
+      goto fail_map_bo;
+   }
+   at->results = at->results_bo->map;
+   list_inithead(&at->pending_results);
+   list_inithead(&at->pending_submission_cs);
+
+   return VK_SUCCESS;
+
+fail_map_bo:
+   tu_bo_finish(dev, at->results_bo);
+fail_bo:
+   free(at->results_bo);
+   u_rwlock_destroy(&at->ht_lock);
+   _mesa_hash_table_destroy(at->ht, NULL);
+
+   return result;
+}
+
+void
+tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
+{
+#if TU_AUTOTUNE_LOG_AT_FINISH != 0
+   while (!list_is_empty(&at->pending_results)) {
+      process_results(at);
+   }
+   /* NOTE(review): the loop above spins until every pending result is consumed. */
+   hash_table_foreach(at->ht, entry) {
+      struct tu_renderpass_history *history = entry->data;
+
+      mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
+                history->key, history->avg_samples, history->num_results);
+   }
+#endif
+   /* Results that were never read back from the GPU are simply dropped. */
+   tu_autotune_free_results(&at->pending_results);
+   /* Each history frees its remaining results through history_destructor(). */
+   hash_table_foreach(at->ht, entry) {
+      struct tu_renderpass_history *history = entry->data;
+      ralloc_free(history);
+   }
+   /* Release fence command streams that were never retired. */
+   list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
+                            &at->pending_submission_cs, node) {
+      tu_cs_finish(&submission_cs->cs);
+      free(submission_cs);
+   }
+
+   _mesa_hash_table_destroy(at->ht, NULL);
+   u_rwlock_destroy(&at->ht_lock);
+   tu_bo_finish(dev, at->results_bo);
+   free(at->results_bo);
+}
+
+bool
+tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
+                                  uint32_t cmd_buffer_count)
+{
+   /* A fence is needed as soon as any buffer carries autotune results. */
+   bool needs_fence = false;
+   for (uint32_t i = 0; i < cmd_buffer_count && !needs_fence; i++) {
+      needs_fence =
+         !list_is_empty(&cmd_buffers[i]->renderpass_autotune_results);
+   }
+   return needs_fence;
+}
+
+void
+tu_autotune_free_results(struct list_head *results)
+{
+   /* _safe variant: freeing a result unlinks it (see result_destructor). */
+   list_for_each_entry_safe(struct tu_renderpass_result, res, results, node) {
+      ralloc_free(res);
+   }
+}
+
+static bool
+fallback_use_bypass(const struct tu_render_pass *pass,
+                    const struct tu_framebuffer *framebuffer,
+                    const struct tu_cmd_buffer *cmd_buffer)
+{
+   /* Heuristic used without history: bypass only for passes with at most
+    * five draws whose subpasses are all single-sampled.
+    */
+   bool bypass = cmd_buffer->state.drawcall_count <= 5;
+   for (unsigned i = 0; bypass && i < pass->subpass_count; i++) {
+      bypass = pass->subpasses[i].samples == VK_SAMPLE_COUNT_1_BIT;
+   }
+
+   return bypass;
+}
+
+bool
+tu_autotune_use_bypass(struct tu_autotune *at,
+                       struct tu_cmd_buffer *cmd_buffer,
+                       struct tu_renderpass_result **autotune_result)
+{
+   const struct tu_render_pass *pass = cmd_buffer->state.pass;
+   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+   /* Returns true to render in sysmem (bypass), false to render with GMEM. */
+   /* If we would want to support buffers that could be submitted
+    * several times we would have to copy the sample counts of renderpasses
+    * after each submission of such buffer (like with u_trace support).
+    * This is rather messy and since almost all apps use ONE_TIME_SUBMIT
+    * we choose to unconditionally use fallback.
+    */
+   bool one_time_submit = cmd_buffer->usage_flags &
+      VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+
+   if (!at->enabled || !one_time_submit)
+      return fallback_use_bypass(pass, framebuffer, cmd_buffer);
+
+   /* We use 64bit hash as a key since we don't fear rare hash collision,
+    * the worst that would happen is sysmem being selected when it should
+    * have not, and with 64bit it would be extremely rare.
+    *
+    * Q: Why not make the key from framebuffer + renderpass pointers?
+    * A: At least DXVK creates new framebuffers each frame while keeping
+    *    renderpasses the same. Also we want to support replaying a single
+    *    frame in a loop for testing.
+    */
+   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
+   /* Always start recording this pass; *autotune_result is only written from here on. */
+   *autotune_result = create_history_result(at, renderpass_key);
+
+   uint32_t avg_samples = 0;
+   if (get_history(at, renderpass_key, &avg_samples)) {
+      /* TODO we should account for load/stores/clears/resolves especially
+       * with low drawcall count and ~fb_size samples passed, in D3D11 games
+       * we are seeing many renderpasses like:
+       *  - color attachment load
+       *  - single fullscreen draw
+       *  - color attachment store
+       */
+
+      /* Low sample count could mean there was only a clear.. or there was
+       * a clear plus draws that touch no or few samples
+       */
+      if (avg_samples < 500) {
+#if TU_AUTOTUNE_DEBUG_LOG != 0
+         mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
+            renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
+#endif
+         return true;
+      }
+      /* NOTE(review): drawcall_count is not checked for zero before the divisions below. */
+      /* Cost-per-sample is an estimate for the average number of reads+
+       * writes for a given passed sample.
+       */
+      float sample_cost = cmd_buffer->state.total_drawcalls_cost;
+      sample_cost /= cmd_buffer->state.drawcall_count;
+
+      float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
+
+      bool select_sysmem = single_draw_cost < 6000.0;
+
+#if TU_AUTOTUNE_DEBUG_LOG != 0
+      mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
+          "sample_cost=%f, single_draw_cost=%f selecting %s",
+          renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
+          sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
+#endif
+
+      return select_sysmem;
+   }
+
+   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
+}
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
new file mode 100644
index 00000000000..3344b3e854d
--- /dev/null
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright © 2021 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef TU_AUTOTUNE_H
+#define TU_AUTOTUNE_H
+
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/rwlock.h"
+
+#define autotune_offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) /* byte offset */
+#define autotune_results_ptr(at, member)             \
+   ((at)->results_bo->iova +                         \
+      autotune_offset((at)->results, &(at)->results->member)) /* iova of results->member */
+
+struct tu_device;
+struct tu_cmd_buffer;
+
+struct tu_autotune_results;
+struct tu_renderpass_history;
+
+/**
+ * "autotune" our decisions about bypass vs GMEM rendering, based on historical
+ * data about a given render target.
+ *
+ * In deciding which path to take there are tradeoffs, including some that
+ * are not reasonably estimateable without having some additional information:
+ *
+ *  (1) If you know you are touching every pixel (ie. there is a clear),
+ *      then the GMEM path will at least not cost more memory bandwidth than
+ *      sysmem[1]
+ *
+ *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
+ *      if there is sysmem->GMEM restore pass.
+ *
+ *  (3) If you see a high draw count, that is an indication that there will be
+ *      enough pixels accessed multiple times to benefit from the reduced
+ *      memory bandwidth that GMEM brings
+ *
+ *  (4) But high draw count where there is not much overdraw can actually be
+ *      faster in bypass mode if it is pushing a lot of state change, due to
+ *      not having to go thru the state changes per-tile[1]
+ *
+ * The approach taken is to measure the samples-passed for the batch to estimate
+ * the amount of overdraw to detect cases where the number of pixels touched is
+ * low.
+ *
+ * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
+ *     most of the tiles late in the tile-pass can defeat that
+ */
+struct tu_autotune {
+
+   /* We may have to disable autotuner if there are too many
+    * renderpasses in-flight.
+    */
+   bool enabled;
+
+   /**
+    * Cache to map renderpass key to historical information about
+    * rendering to that particular render target.
+    */
+   struct hash_table *ht;
+   struct u_rwlock ht_lock; /* read side: get_history(); write side: tu_autotune_on_submit() */
+
+   /**
+    * GPU buffer used to communicate back results to the CPU
+    */
+   struct tu_bo *results_bo;
+   struct tu_autotune_results *results; /* CPU mapping of results_bo */
+
+   /**
+    * List of per-renderpass results that we are waiting for the GPU
+    * to finish with before reading back the results.
+    */
+   struct list_head pending_results;
+
+   /**
+    * List of per-submission CS that we are waiting for the GPU
+    * to finish using.
+    */
+   struct list_head pending_submission_cs;
+
+   uint32_t fence_counter; /* last fence handed out; pre-incremented so 0 is never valid */
+   uint32_t idx_counter;   /* atomically incremented source of tu_renderpass_result::idx */
+};
+
+#define TU_AUTOTUNE_MAX_RESULTS 256
+
+/**
+ * The layout of the memory used to read back per-batch results from the
+ * GPU.
+ * Note this struct is intentionally aligned to 4k.  And hw requires the
+ * sample start/stop locations to be 128b aligned (presumably why the pads exist).
+ * NOTE(review): the fields add up to 16 + 256 * 32 bytes - confirm the 4k claim.
+ */
+struct tu_autotune_results {
+
+   /**
+    * The GPU writes back a "fence" seqno value from the cmdstream after
+    * it finishes the submission, so that the CPU knows when
+    * results are valid.
+    */
+   uint32_t fence;
+
+   uint32_t __pad0;
+   uint64_t __pad1;
+
+   /**
+    * From the cmdstream, the captured samples-passed values are recorded
+    * at the start and end of the batch.
+    *
+    * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
+    * may force us to revisit that.
+    */
+   struct {
+      uint64_t samples_start;
+      uint64_t __pad0;
+      uint64_t samples_end;
+      uint64_t __pad1;
+   } result[TU_AUTOTUNE_MAX_RESULTS];
+};
+
+/**
+ * Tracks the results from an individual renderpass. Created per renderpass on
+ * the command buffer's list, spliced onto at->pending_results at submit time;
+ * once the GPU has finished writing the results, we fill samples_passed.
+ */
+struct tu_renderpass_result {
+
+   /**
+    * The index/slot in tu_autotune_results::result[] to write start/end
+    * counter to
+    */
+   unsigned idx;
+
+   /*
+    * Below here, only used internally within autotune
+    */
+   uint64_t rp_key;                       /* renderpass hash this result belongs to */
+   struct tu_renderpass_history *history; /* set in tu_autotune_on_submit() */
+   struct list_head node;                 /* link in whichever list currently owns the result */
+   uint32_t fence;                        /* fence of the submission carrying this result */
+   uint64_t samples_passed;               /* samples_end - samples_start, filled on readback */
+};
+
+VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
+void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
+
+/* A magic 8-ball that tells the gmem code whether we should do bypass mode for moar fps. */
+bool tu_autotune_use_bypass(struct tu_autotune *at,
+                            struct tu_cmd_buffer *cmd_buffer,
+                            struct tu_renderpass_result **autotune_result);
+void tu_autotune_free_results(struct list_head *results);
+
+bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
+                                       uint32_t cmd_buffer_count);
+
+/**
+ * Processes finished results, tags the submitted ones with a new fence and returns its cs.
+ */
+struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
+                                    struct tu_autotune *at,
+                                    struct tu_cmd_buffer **cmd_buffers,
+                                    uint32_t cmd_buffer_count);
+
+
+#endif /* TU_AUTOTUNE_H */
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index bd067d3af16..d9394bcf872 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -592,7 +592,8 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
 }
 
 static bool
-use_sysmem_rendering(struct tu_cmd_buffer *cmd)
+use_sysmem_rendering(struct tu_cmd_buffer *cmd,
+                     struct tu_renderpass_result **autotune_result)
 {
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
       return true;
@@ -615,7 +616,13 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd)
    if (cmd->state.disable_gmem)
       return true;
 
-   return false;
+   bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
+                                            cmd, autotune_result);
+   if (*autotune_result) {
+      list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
+   }
+
+   return use_sysmem;
 }
 
 static void
@@ -1210,7 +1217,50 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
 }
 
 static void
-tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_autotune_begin(struct tu_cs *cs, struct tu_autotune *at,
+                   const struct tu_renderpass_result *autotune_result)
+{
+   if (!autotune_result)
+      return;
+   /* Slot in the results BO that receives this pass's starting sample count. */
+   uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
+   uint64_t begin_iova = autotune_results_ptr(at, result[result_idx].samples_start);
+   /* Presumably: the RB copies its sample counter to the address on ZPASS_DONE - confirm. */
+   tu_cs_emit_regs(cs,
+                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
+
+   tu_cs_emit_regs(cs,
+                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
+
+   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
+   tu_cs_emit(cs, ZPASS_DONE);
+}
+
+static void
+tu6_autotune_end(struct tu_cs *cs, struct tu_autotune *at,
+                 const struct tu_renderpass_result *autotune_result)
+{
+   if (!autotune_result)
+      return;
+   /* Slot in the results BO that receives this pass's ending sample count. */
+   uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
+   uint64_t end_iova = autotune_results_ptr(at, result[result_idx].samples_end);
+   /* Presumably: the RB copies its sample counter to the address on ZPASS_DONE - confirm. */
+   tu_cs_emit_regs(cs,
+                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
+
+   tu_cs_emit_regs(cs,
+                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
+
+   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
+   tu_cs_emit(cs, ZPASS_DONE);
+
+   /* A fence would be emitted at the submission time */
+}
+
+static void
+tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                        const struct tu_renderpass_result *autotune_result)
 {
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
 
@@ -1240,12 +1290,17 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
    tu_cs_emit(cs, 0x0);
 
+   tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
+
    tu_cs_sanity_check(cs);
 }
 
 static void
-tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                      const struct tu_renderpass_result *autotune_result)
 {
+   tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
+
    /* Do any resolves of the last subpass. These are handled in the
     * tile_store_cs in the gmem path.
     */
@@ -1262,7 +1317,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 }
 
 static void
-tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                      const struct tu_renderpass_result *autotune_result)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
 
@@ -1312,6 +1368,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
                         A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
    }
 
+   tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
+
    tu_cs_sanity_check(cs);
 }
 
@@ -1340,8 +1398,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 }
 
 static void
-tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                    const struct tu_renderpass_result *autotune_result)
 {
+   tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
+
    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
 
    tu_cs_emit_regs(cs,
@@ -1355,11 +1416,12 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 }
 
 static void
-tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
+tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
+                    const struct tu_renderpass_result *autotune_result)
 {
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
 
-   tu6_tile_render_begin(cmd, &cmd->cs);
+   tu6_tile_render_begin(cmd, &cmd->cs, autotune_result);
 
    uint32_t pipe = 0;
    for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
@@ -1381,7 +1443,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
       }
    }
 
-   tu6_tile_render_end(cmd, &cmd->cs);
+   tu6_tile_render_end(cmd, &cmd->cs, autotune_result);
 
    trace_end_render_pass(&cmd->trace, &cmd->cs, fb);
 
@@ -1391,9 +1453,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
 }
 
 static void
-tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
+tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
+                     const struct tu_renderpass_result *autotune_result)
 {
-   tu6_sysmem_render_begin(cmd, &cmd->cs);
+   tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result);
 
    trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);
 
@@ -1401,7 +1464,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
 
    trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
 
-   tu6_sysmem_render_end(cmd, &cmd->cs);
+   tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result);
 
    trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
 }
@@ -1442,7 +1505,9 @@ tu_create_cmd_buffer(struct tu_device *device,
       cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
    }
 
+
    u_trace_init(&cmd_buffer->trace, &device->trace_context);
+   list_inithead(&cmd_buffer->renderpass_autotune_results);
 
    tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
    tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
@@ -1468,6 +1533,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
 
    u_trace_fini(&cmd_buffer->trace);
 
+   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
       if (cmd_buffer->descriptors[i].push_set.layout)
          tu_descriptor_set_layout_unref(cmd_buffer->device,
@@ -1492,6 +1559,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
 
+   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
       memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
       if (cmd_buffer->descriptors[i].push_set.layout)
@@ -3818,6 +3887,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
 {
    const struct tu_pipeline *pipeline = cmd->state.pipeline;
 
+   /* Fill draw stats for autotuner */
+   cmd->state.drawcall_count++;
+
+   cmd->state.total_drawcalls_cost += cmd->state.pipeline->drawcall_base_cost;
+   if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
+      cmd->state.total_drawcalls_cost++;
+   if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
+      cmd->state.total_drawcalls_cost++;
+
    tu_emit_cache_flush_renderpass(cmd, cs);
 
    bool primitive_restart_enabled = pipeline->ia.primitive_restart;
@@ -4584,10 +4662,11 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
 
    cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
 
-   if (use_sysmem_rendering(cmd_buffer))
-      tu_cmd_render_sysmem(cmd_buffer);
+   struct tu_renderpass_result *autotune_result = NULL;
+   if (use_sysmem_rendering(cmd_buffer, &autotune_result))
+      tu_cmd_render_sysmem(cmd_buffer, autotune_result);
    else
-      tu_cmd_render_tiles(cmd_buffer);
+      tu_cmd_render_tiles(cmd_buffer, autotune_result);
 
    /* Outside of renderpasses we assume all draw states are disabled. We do
     * this outside the draw CS for the normal case where 3d gmem stores aren't
@@ -4617,6 +4696,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
    cmd_buffer->state.has_tess = false;
    cmd_buffer->state.has_subpass_predication = false;
    cmd_buffer->state.disable_gmem = false;
+   cmd_buffer->state.drawcall_count = 0;
+   cmd_buffer->state.total_drawcalls_cost = 0;
 
    /* LRZ is not valid next time we use it */
    cmd_buffer->state.lrz.valid = false;
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 7f7656fa58b..5f60a4a65b1 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1810,6 +1810,11 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
    device->mem_cache = tu_pipeline_cache_from_handle(pc);
 
+   result = tu_autotune_init(&device->autotune, device);
+   if (result != VK_SUCCESS) {
+      goto fail_timeline_cond;
+   }
+
    for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
       mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
 
@@ -1891,6 +1896,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
       free(device->perfcntrs_pass_cs);
    }
 
+   tu_autotune_fini(&device->autotune, device);
+
    pthread_cond_destroy(&device->timeline_cond);
    vk_free(&device->vk.alloc, device->bo_list);
    vk_free(&device->vk.alloc, device->bo_idx);
diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c
index 93307300a3c..150cbe97612 100644
--- a/src/freedreno/vulkan/tu_drm.c
+++ b/src/freedreno/vulkan/tu_drm.c
@@ -53,6 +53,8 @@ struct tu_queue_submit
    uint32_t nr_out_syncobjs;
    uint32_t entry_count;
    uint32_t perf_pass_index;
+
+   bool     autotune_fence;
 };
 
 struct tu_u_trace_syncobj
@@ -746,8 +748,14 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
       }
    }
 
+
    memset(new_submit, 0, sizeof(struct tu_queue_submit));
 
+   new_submit->autotune_fence =
+      tu_autotune_submit_requires_fence(cmd_buffers, vk_submit->command_buffer_count);
+   if (new_submit->autotune_fence)
+      entry_count++;
+
    new_submit->cmds = vk_zalloc(&queue->device->vk.alloc,
          entry_count * sizeof(*new_submit->cmds), 8,
          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -818,9 +826,26 @@ tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
 }
 
 static void
-tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
-                                   struct tu_queue_submit *submit)
+tu_fill_msm_gem_submit(struct tu_device *dev,              /* supplies the bo_idx[] lookup */
+                       struct drm_msm_gem_submit_cmd *cmd, /* out: submit cmd slot to fill */
+                       struct tu_cs_entry *cs_entry)       /* in: CS entry being described; caller advances its own slot index */
 {
+   cmd->type = MSM_SUBMIT_CMD_BUF;
+   cmd->submit_idx =
+      dev->bo_idx[cs_entry->bo->gem_handle]; /* index found via the entry BO's GEM handle */
+   cmd->submit_offset = cs_entry->offset;
+   cmd->size = cs_entry->size;
+   cmd->pad = 0;
+   cmd->nr_relocs = 0; /* no relocations are attached */
+   cmd->relocs = 0;
+}
+
+static void
+tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
+                                   struct tu_queue_submit *submit,
+                                   struct tu_cs *autotune_cs)
+{
+   struct tu_device *dev = queue->device;
    struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
 
    struct vk_command_buffer **vk_cmd_buffers = submit->vk_submit->command_buffers;
@@ -836,45 +861,27 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
          struct tu_cs_entry *perf_cs_entry =
             &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];
 
-         cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
-         cmds[entry_idx].submit_idx =
-            dev->bo_idx[perf_cs_entry->bo->gem_handle];
-         cmds[entry_idx].submit_offset = perf_cs_entry->offset;
-         cmds[entry_idx].size = perf_cs_entry->size;
-         cmds[entry_idx].pad = 0;
-         cmds[entry_idx].nr_relocs = 0;
-         cmds[entry_idx++].relocs = 0;
+         tu_fill_msm_gem_submit(dev, &cmds[entry_idx++], perf_cs_entry);
       }
 
       for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
-         cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
-         cmds[entry_idx].submit_idx =
-            dev->bo_idx[cs->entries[i].bo->gem_handle];
-         cmds[entry_idx].submit_offset = cs->entries[i].offset;
-         cmds[entry_idx].size = cs->entries[i].size;
-         cmds[entry_idx].pad = 0;
-         cmds[entry_idx].nr_relocs = 0;
-         cmds[entry_idx].relocs = 0;
+         tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
       }
 
       if (submit->u_trace_submission_data) {
          struct tu_cs *ts_cs =
             submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
          if (ts_cs) {
-            cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
-            cmds[entry_idx].submit_idx =
-               queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle];
-
-            assert(cmds[entry_idx].submit_idx < queue->device->bo_count);
-
-            cmds[entry_idx].submit_offset = ts_cs->entries[0].offset;
-            cmds[entry_idx].size = ts_cs->entries[0].size;
-            cmds[entry_idx].pad = 0;
-            cmds[entry_idx].nr_relocs = 0;
-            cmds[entry_idx++].relocs = 0;
+            tu_fill_msm_gem_submit(dev, &cmds[entry_idx++], &ts_cs->entries[0]);
          }
       }
    }
+
+   if (autotune_cs) {
+      assert(autotune_cs->entry_count == 1);
+      tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
+      entry_idx++;
+   }
 }
 
 static VkResult
@@ -882,6 +889,15 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
 {
    queue->device->submit_count++;
 
+   struct tu_cs *autotune_cs = NULL;
+   if (submit->autotune_fence) {
+      struct tu_cmd_buffer **cmd_buffers = (void *)submit->vk_submit->command_buffers;
+      autotune_cs = tu_autotune_on_submit(queue->device,
+                                          &queue->device->autotune,
+                                          cmd_buffers,
+                                          submit->vk_submit->command_buffer_count);
+   }
+
    uint32_t flags = MSM_PIPE_3D0;
 
    if (submit->vk_submit->wait_count)
@@ -896,7 +912,7 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
     * time when bo_mutex is not locked. So we build submit cmds here the real
     * place to submit.
     */
-   tu_queue_build_msm_gem_submit_cmds(queue, submit);
+   tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);
 
    struct drm_msm_gem_submit req = {
       .flags = flags,
diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c
index e93e04d8a6a..861c55d6e02 100644
--- a/src/freedreno/vulkan/tu_kgsl.c
+++ b/src/freedreno/vulkan/tu_kgsl.c
@@ -358,6 +358,10 @@ tu_QueueSubmit(VkQueue _queue,
             entry_count++;
       }
 
+      struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers;
+      if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount))
+         entry_count++;
+
       max_entry_count = MAX2(max_entry_count, entry_count);
    }
 
@@ -404,6 +408,22 @@ tu_QueueSubmit(VkQueue _queue,
          }
       }
 
+      struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers;
+      if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount)) {
+         struct tu_cs *autotune_cs =
+            tu_autotune_on_submit(queue->device,
+                                  &queue->device->autotune,
+                                  cmd_buffers,
+                                  submit->commandBufferCount);
+         cmds[entry_idx++] = (struct kgsl_command_object) {
+            .offset = autotune_cs->entries[0].offset,
+            .gpuaddr = autotune_cs->entries[0].bo->iova,
+            .size = autotune_cs->entries[0].size,
+            .flags = KGSL_CMDLIST_IB,
+            .id = autotune_cs->entries[0].bo->gem_handle,
+         };
+      }
+
       struct tu_syncobj s = sync_merge(submit->pWaitSemaphores,
                                        submit->waitSemaphoreCount,
                                        true, true);
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index a0e18991596..b52ba8cdbd7 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -1576,6 +1576,9 @@ tu6_emit_fs_outputs(struct tu_cs *cs,
           (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
          pipeline->lrz.force_late_z = true;
       }
+
+      pipeline->drawcall_base_cost +=
+         util_bitcount(fs_render_components) / util_bitcount(0xf);
    }
 }
 
@@ -3121,6 +3124,10 @@ tu_pipeline_builder_parse_multisample_and_color_blend(
          if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
             pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
          }
+
+         if (blendAttachment.blendEnable) {
+            pipeline->drawcall_base_cost++;
+         }
       }
    }
 
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 70d74066d37..d0bf7072a36 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -77,6 +77,7 @@
 #include "perfcntrs/freedreno_perfcntr.h"
 
 #include "tu_descriptor_set.h"
+#include "tu_autotune.h"
 #include "tu_util.h"
 #include "tu_perfetto.h"
 
@@ -462,6 +463,8 @@ struct tu_device
    pthread_cond_t timeline_cond;
    pthread_mutex_t submit_mutex;
 
+   struct tu_autotune autotune;
+
 #ifdef ANDROID
    const void *gralloc;
    enum {
@@ -1063,6 +1066,35 @@ struct tu_cmd_state
    bool disable_gmem;
    enum a5xx_line_mode line_mode;
 
+   uint32_t drawcall_count;
+
+   /* A calculated "draw cost" value for renderpass, which tries to
+    * estimate the bandwidth-per-sample of all the draws according
+    * to:
+    *
+    *    foreach_draw (...) {
+    *      cost += num_frag_outputs;
+    *      if (blend_enabled)
+    *        cost += num_blend_enabled;
+    *      if (depth_test_enabled)
+    *        cost++;
+    *      if (depth_write_enabled)
+    *        cost++;
+    *    }
+    *
+    * The idea is that each sample-passed minimally does one write
+    * per MRT.  If blend is enabled, the hw will additionally do
+    * a framebuffer read per sample-passed (for each MRT with blend
+    * enabled).  If depth-test is enabled, the hw will additionally do
+    * a depth buffer read.  If depth-write is enabled, the hw will
+    * additionally do a depth buffer write.
+    *
+    * This does ignore depth buffer traffic for samples which do not
+    * pass due to depth-test fail, and some other details.  But it is
+    * just intended to be a rough estimate that is easy to calculate.
+    */
+   uint32_t total_drawcalls_cost;
+
    struct tu_lrz_state lrz;
 
    struct tu_draw_state depth_plane_state;
@@ -1102,6 +1134,8 @@ struct tu_cmd_buffer
    struct u_trace_iterator trace_renderpass_start;
    struct u_trace_iterator trace_renderpass_end;
 
+   struct list_head renderpass_autotune_results;
+
    VkCommandBufferUsageFlags usage_flags;
    VkCommandBufferLevel level;
    enum tu_cmd_buffer_status status;
@@ -1300,6 +1334,9 @@ struct tu_pipeline
 
    struct tu_lrz_pipeline lrz;
 
+   /* Base drawcall cost for sysmem vs gmem autotuner */
+   uint8_t drawcall_base_cost;
+
    void *executables_mem_ctx;
    /* tu_pipeline_executable */
    struct util_dynarray executables;