tu/autotune: Improve RP hash

Makes RP hash more unique by using attachment IOVAs and considering
multiple instances of the same RP within a CB.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37802>
This commit is contained in:
Danylo Piliaiev 2025-09-01 17:45:18 +02:00 committed by Dhruv Mark Collins
parent 40ffc052af
commit 44564b966d
2 changed files with 65 additions and 11 deletions

View file

@ -434,6 +434,9 @@ struct tu_autotune::rp_entry {
bool sysmem;
uint32_t draw_count;
/* Number of times this RP has repeated so far, used to uniquely identify instances of the same RP. */
uint32_t duplicates = 0;
rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count)
: device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count)
{
@ -583,12 +586,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd)
{
/* Q: Why not make the key from framebuffer + renderpass pointers?
* A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents
* of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably
* identify the same renderpass instance.
/* It may be hard to match the same renderpass between frames, or rather it's hard to strike a
* balance between being too lax with identifying different renderpasses as the same one, and
* not recognizing the same renderpass between frames when only a small thing changed.
*
* This is mainly an issue with translation layers (particularly DXVK), because a layer may
* break a "renderpass" into smaller ones due to some heuristic that isn't consistent between
* frames.
*
* Note: Not using image IOVA leads to too many false matches.
*/
struct PACKED packed_att_properties {
uint64_t iova;
bool load;
bool store;
bool load_stencil;
bool store_stencil;
};
auto get_hash = [&](uint32_t *data, size_t size) {
uint32_t *ptr = data;
*ptr++ = framebuffer->width;
@ -596,12 +612,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
*ptr++ = framebuffer->layers;
for (unsigned i = 0; i < pass->attachment_count; i++) {
*ptr++ = cmd->state.attachments[i]->view.width;
*ptr++ = cmd->state.attachments[i]->view.height;
*ptr++ = cmd->state.attachments[i]->image->vk.format;
*ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
*ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
packed_att_properties props = {
.iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset,
.load = pass->attachments[i].load,
.store = pass->attachments[i].store,
.load_stencil = pass->attachments[i].load_stencil,
.store_stencil = pass->attachments[i].store_stencil,
};
memcpy(ptr, &props, sizeof(packed_att_properties));
ptr += sizeof(packed_att_properties) / sizeof(uint32_t);
}
assert(ptr == data + size);
return XXH3_64bits(data, size * sizeof(uint32_t));
};
@ -609,8 +631,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
/* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of
* cases, while only extreme cases need to allocate on the heap.
*/
size_t data_count = 3 + (pass->attachment_count * 5);
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */
size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t));
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */
if (data_count <= STACK_MAX_DATA_COUNT) {
/* If the data is small enough, we can use the stack. */
@ -623,6 +645,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
}
}
/* Derive a distinct key from an existing one by re-hashing its hash with the
 * duplicate count as the seed. This lets repeated occurrences of an otherwise
 * identical RP inside one command buffer map to separate history entries.
 */
tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates)
{
   const uint64_t base_hash = key.hash;
   hash = XXH3_64bits_withSeed(&base_hash, sizeof(base_hash), duplicates);
}
/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing
* factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation).
*/
@ -670,6 +697,7 @@ template <typename T = double> class exponential_average {
struct tu_autotune::rp_history {
public:
uint64_t hash; /* The hash of the renderpass, just for debug output. */
uint32_t duplicates; /* The amount of times we've seen this RP, used for identifying repeated RPs. */
std::atomic<uint32_t> refcount = 0; /* Reference count to prevent deletion when active. */
std::atomic<uint64_t> last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. */
@ -979,6 +1007,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device,
return new_entry.get();
}
/* Look up an already-attached RP entry in this command buffer's batch whose
 * history hash matches the given key. Returns NULL when no prior instance of
 * this RP exists in the batch. Linear scan is fine here: the number of RPs
 * recorded into a single command buffer is small.
 */
tu_autotune::rp_entry *
tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key)
{
   for (auto &candidate : batch->entries) {
      if (candidate->history->hash != key.hash)
         continue;
      return candidate.get();
   }
   return nullptr;
}
tu_autotune::render_mode
tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx)
{
@ -1031,6 +1069,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
return render_mode::SYSMEM;
rp_key key(pass, framebuffer, cmd_buffer);
/* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a
* unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses
* across different command buffers, it is good enough in most cases.
*/
rp_entry *entry = cb_ctx.find_rp_entry(key);
if (entry) {
entry->duplicates++;
key = rp_key(key, entry->duplicates);
}
*rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
rp_history &history = *((*rp_ctx)->history);

View file

@ -157,6 +157,9 @@ struct tu_autotune {
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd);
/* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
rp_key(const rp_key &key, uint32_t duplicates);
/* Equality operator, used in unordered_map. */
constexpr bool operator==(const rp_key &other) const noexcept
{
@ -211,6 +214,8 @@ struct tu_autotune {
rp_entry *
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
rp_entry *find_rp_entry(const rp_key &key);
friend struct tu_autotune;
public: