diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 971cc1a9503..cfc145e3286 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -434,6 +434,9 @@ struct tu_autotune::rp_entry { bool sysmem; uint32_t draw_count; + /* Number of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ + uint32_t duplicates = 0; + rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count) : device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count) { @@ -583,12 +586,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd) { - /* Q: Why not make the key from framebuffer + renderpass pointers? - * A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents - * of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably - * identify the same renderpass instance. + /* It may be hard to match the same renderpass between frames, or rather it's hard to strike a + * balance between being too lax with identifying different renderpasses as the same one, and + * not recognizing the same renderpass between frames when only a small thing changed. + * + * This is mainly an issue with translation layers (particularly DXVK), because a layer may + * break a "renderpass" into smaller ones due to some heuristic that isn't consistent between + * frames. + * + * Note: Not including the image IOVA in the hash leads to too many false matches, so it is part of the key. 
*/ + struct PACKED packed_att_properties { + uint64_t iova; + bool load; + bool store; + bool load_stencil; + bool store_stencil; + }; + auto get_hash = [&](uint32_t *data, size_t size) { uint32_t *ptr = data; *ptr++ = framebuffer->width; @@ -596,12 +612,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, *ptr++ = framebuffer->layers; for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + packed_att_properties props = { + .iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset, + .load = pass->attachments[i].load, + .store = pass->attachments[i].store, + .load_stencil = pass->attachments[i].load_stencil, + .store_stencil = pass->attachments[i].store_stencil, + }; + + memcpy(ptr, &props, sizeof(packed_att_properties)); + ptr += sizeof(packed_att_properties) / sizeof(uint32_t); } + assert(ptr == data + size); return XXH3_64bits(data, size * sizeof(uint32_t)); }; @@ -609,8 +631,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of * cases, while only extreme cases need to allocate on the heap. */ - size_t data_count = 3 + (pass->attachment_count * 5); - constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */ + size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t)); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */ if (data_count <= STACK_MAX_DATA_COUNT) { /* If the data is small enough, we can use the stack. 
*/ @@ -623,6 +645,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, } } +tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates) +{ + hash = XXH3_64bits_withSeed(&key.hash, sizeof(key.hash), duplicates); +} + /* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). */ @@ -670,6 +697,7 @@ template class exponential_average { struct tu_autotune::rp_history { public: uint64_t hash; /* The hash of the renderpass, just for debug output. */ + uint32_t duplicates; /* The number of times we've seen this RP, used for identifying repeated RPs. */ std::atomic refcount = 0; /* Reference count to prevent deletion when active. */ std::atomic last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. */ @@ -979,6 +1007,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, return new_entry.get(); } +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key) +{ + for (auto &entry : batch->entries) { + if (entry->history->hash == key.hash) + return entry.get(); + } + return nullptr; +} + tu_autotune::render_mode tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { @@ -1031,6 +1069,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx return render_mode::SYSMEM; rp_key key(pass, framebuffer, cmd_buffer); + + /* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a + * unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses + * across different command buffers, it is good enough in most cases. 
+ */ + rp_entry *entry = cb_ctx.find_rp_entry(key); + if (entry) { + entry->duplicates++; + key = rp_key(key, entry->duplicates); + } + *rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count); rp_history &history = *((*rp_ctx)->history); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 333236eee29..ec0d03b6d31 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -157,6 +157,9 @@ struct tu_autotune { const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd); + /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */ + rp_key(const rp_key &key, uint32_t duplicates); + /* Equality operator, used in unordered_map. */ constexpr bool operator==(const rp_key &other) const noexcept { @@ -211,6 +214,8 @@ struct tu_autotune { rp_entry * attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count); + rp_entry *find_rp_entry(const rp_key &key); + friend struct tu_autotune; public: