tu/autotune: Improve RP hash

Makes RP hash more unique by using attachment IOVAs and considering
multiple instances of the same RP within a CB.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37802>
This commit is contained in:
Danylo Piliaiev 2025-09-01 17:45:18 +02:00 committed by Dhruv Mark Collins
parent 40ffc052af
commit 44564b966d
2 changed files with 65 additions and 11 deletions

View file

@ -434,6 +434,9 @@ struct tu_autotune::rp_entry {
bool sysmem;
uint32_t draw_count;
/* Number of times this RP has repeated so far, used to uniquely identify instances of the same RP. */
uint32_t duplicates = 0;
rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count)
: device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count)
{
@ -583,12 +586,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd)
{
/* Q: Why not make the key from framebuffer + renderpass pointers?
* A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents
* of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably
* identify the same renderpass instance.
/* It may be hard to match the same renderpass between frames, or rather it's hard to strike a
* balance between being too lax with identifying different renderpasses as the same one, and
* not recognizing the same renderpass between frames when only a small thing changed.
*
* This is mainly an issue with translation layers (particularly DXVK), because a layer may
* break a "renderpass" into smaller ones due to some heuristic that isn't consistent between
* frames.
*
* Note: Not using image IOVA leads to too many false matches.
*/
struct PACKED packed_att_properties {
uint64_t iova;
bool load;
bool store;
bool load_stencil;
bool store_stencil;
};
auto get_hash = [&](uint32_t *data, size_t size) {
uint32_t *ptr = data;
*ptr++ = framebuffer->width;
@ -596,12 +612,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
*ptr++ = framebuffer->layers;
for (unsigned i = 0; i < pass->attachment_count; i++) {
*ptr++ = cmd->state.attachments[i]->view.width;
*ptr++ = cmd->state.attachments[i]->view.height;
*ptr++ = cmd->state.attachments[i]->image->vk.format;
*ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
*ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
packed_att_properties props = {
.iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset,
.load = pass->attachments[i].load,
.store = pass->attachments[i].store,
.load_stencil = pass->attachments[i].load_stencil,
.store_stencil = pass->attachments[i].store_stencil,
};
memcpy(ptr, &props, sizeof(packed_att_properties));
ptr += sizeof(packed_att_properties) / sizeof(uint32_t);
}
assert(ptr == data + size);
return XXH3_64bits(data, size * sizeof(uint32_t));
};
@ -609,8 +631,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
/* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of
* cases, while only extreme cases need to allocate on the heap.
*/
size_t data_count = 3 + (pass->attachment_count * 5);
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */
size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t));
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */
if (data_count <= STACK_MAX_DATA_COUNT) {
/* If the data is small enough, we can use the stack. */
@ -623,6 +645,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
}
}
/* Derive a distinct key from an existing one by re-hashing its hash with the
 * duplicate count as the seed. This lets repeated occurrences of an otherwise
 * identical RP inside one command buffer map to separate history entries.
 */
tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates)
{
   const uint64_t base_hash = key.hash;
   hash = XXH3_64bits_withSeed(&base_hash, sizeof(base_hash), duplicates);
}
/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing
* factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation).
*/
@ -670,6 +697,7 @@ template <typename T = double> class exponential_average {
struct tu_autotune::rp_history {
public:
uint64_t hash; /* The hash of the renderpass, just for debug output. */
uint32_t duplicates; /* The amount of times we've seen this RP, used for identifying repeated RPs. */
std::atomic<uint32_t> refcount = 0; /* Reference count to prevent deletion when active. */
std::atomic<uint64_t> last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. */
@ -979,6 +1007,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device,
return new_entry.get();
}
/* Look up an already-attached RP entry in this command buffer's batch whose
 * history hash matches the given key. Returns NULL when no prior instance of
 * this RP exists in the batch. Linear scan is fine here: the number of RPs
 * recorded into a single command buffer is small.
 */
tu_autotune::rp_entry *
tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key)
{
   for (auto &candidate : batch->entries) {
      if (candidate->history->hash != key.hash)
         continue;
      return candidate.get();
   }
   return nullptr;
}
tu_autotune::render_mode
tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx)
{
@ -1031,6 +1069,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
return render_mode::SYSMEM;
rp_key key(pass, framebuffer, cmd_buffer);
/* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a
* unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses
* across different command buffers, it is good enough in most cases.
*/
rp_entry *entry = cb_ctx.find_rp_entry(key);
if (entry) {
entry->duplicates++;
key = rp_key(key, entry->duplicates);
}
*rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
rp_history &history = *((*rp_ctx)->history);

View file

@ -157,6 +157,9 @@ struct tu_autotune {
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd);
/* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
rp_key(const rp_key &key, uint32_t duplicates);
/* Equality operator, used in unordered_map. */
constexpr bool operator==(const rp_key &other) const noexcept
{
@ -211,6 +214,8 @@ struct tu_autotune {
rp_entry *
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
rp_entry *find_rp_entry(const rp_key &key);
friend struct tu_autotune;
public: