diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 971cc1a9503..cfc145e3286 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -434,6 +434,9 @@ struct tu_autotune::rp_entry { bool sysmem; uint32_t draw_count; + /* Number of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ + uint32_t duplicates = 0; + rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count) : device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count) { @@ -583,12 +586,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd) { - /* Q: Why not make the key from framebuffer + renderpass pointers? - * A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents - * of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably - * identify the same renderpass instance. + /* It may be hard to match the same renderpass between frames, or rather it's hard to strike a + * balance between being too lax with identifying different renderpasses as the same one, and + * not recognizing the same renderpass between frames when only a small thing changed. + * + * This is mainly an issue with translation layers (particularly DXVK), because a layer may + * break a "renderpass" into smaller ones due to some heuristic that isn't consistent between + * frames. + * + * Note: Not including the image IOVA in the hash leads to too many false matches, so it is part of the key. 
*/ + struct PACKED packed_att_properties { + uint64_t iova; + bool load; + bool store; + bool load_stencil; + bool store_stencil; + }; + auto get_hash = [&](uint32_t *data, size_t size) { uint32_t *ptr = data; *ptr++ = framebuffer->width; @@ -596,12 +612,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, *ptr++ = framebuffer->layers; for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + packed_att_properties props = { + .iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset, + .load = pass->attachments[i].load, + .store = pass->attachments[i].store, + .load_stencil = pass->attachments[i].load_stencil, + .store_stencil = pass->attachments[i].store_stencil, + }; + + memcpy(ptr, &props, sizeof(packed_att_properties)); + ptr += sizeof(packed_att_properties) / sizeof(uint32_t); } + assert(ptr == data + size); return XXH3_64bits(data, size * sizeof(uint32_t)); }; @@ -609,8 +631,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of * cases, while only extreme cases need to allocate on the heap. */ - size_t data_count = 3 + (pass->attachment_count * 5); - constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */ + size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t)); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */ if (data_count <= STACK_MAX_DATA_COUNT) { /* If the data is small enough, we can use the stack. 
*/ @@ -623,6 +645,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, } } +tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates) +{ + hash = XXH3_64bits_withSeed(&key.hash, sizeof(key.hash), duplicates); +} + /* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). */ @@ -670,6 +697,7 @@ template class exponential_average { struct tu_autotune::rp_history { public: uint64_t hash; /* The hash of the renderpass, just for debug output. */ + uint32_t duplicates; /* The number of times we've seen this RP, used for identifying repeated RPs. */ std::atomic refcount = 0; /* Reference count to prevent deletion when active. */ std::atomic last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. */ @@ -979,6 +1007,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, return new_entry.get(); } +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key) +{ + for (auto &entry : batch->entries) { + if (entry->history->hash == key.hash) + return entry.get(); + } + return nullptr; +} + tu_autotune::render_mode tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { @@ -1031,6 +1069,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx return render_mode::SYSMEM; rp_key key(pass, framebuffer, cmd_buffer); + + /* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a + * unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses + * across different command buffers, it is good enough in most cases. 
+ */ + rp_entry *entry = cb_ctx.find_rp_entry(key); + if (entry) { + entry->duplicates++; + key = rp_key(key, entry->duplicates); + } + *rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count); rp_history &history = *((*rp_ctx)->history); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 333236eee29..ec0d03b6d31 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -157,6 +157,9 @@ struct tu_autotune { const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd); + /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */ + rp_key(const rp_key &key, uint32_t duplicates); + /* Equality operator, used in unordered_map. */ constexpr bool operator==(const rp_key &other) const noexcept { @@ -211,6 +214,8 @@ struct tu_autotune { rp_entry * attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count); + rp_entry *find_rp_entry(const rp_key &key); + friend struct tu_autotune; public: