mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 04:58:05 +02:00
tu/autotune: Improve RP hash
Makes RP hash more unique by using attachment IOVAs and considering multiple instances of the same RP within a CB. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37802>
This commit is contained in:
parent
40ffc052af
commit
44564b966d
2 changed files with 65 additions and 11 deletions
|
|
@ -434,6 +434,9 @@ struct tu_autotune::rp_entry {
|
|||
bool sysmem;
|
||||
uint32_t draw_count;
|
||||
|
||||
/* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */
|
||||
uint32_t duplicates = 0;
|
||||
|
||||
rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count)
|
||||
: device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count)
|
||||
{
|
||||
|
|
@ -583,12 +586,25 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
|
|||
const struct tu_framebuffer *framebuffer,
|
||||
const struct tu_cmd_buffer *cmd)
|
||||
{
|
||||
/* Q: Why not make the key from framebuffer + renderpass pointers?
|
||||
* A: At least DXVK creates new framebuffers each frame while keeping renderpasses the same. Hashing the contents
|
||||
* of the framebuffer and renderpass is more stable, and it maintains stability across runs, so we can reliably
|
||||
* identify the same renderpass instance.
|
||||
/* It may be hard to match the same renderpass between frames, or rather it's hard to strike a
|
||||
* balance between being too lax with identifying different renderpasses as the same one, and
|
||||
* not recognizing the same renderpass between frames when only a small thing changed.
|
||||
*
|
||||
* This is mainly an issue with translation layers (particularly DXVK), because a layer may
|
||||
* break a "renderpass" into smaller ones due to some heuristic that isn't consistent between
|
||||
* frames.
|
||||
*
|
||||
* Note: Not using image IOVA leads to too many false matches.
|
||||
*/
|
||||
|
||||
struct PACKED packed_att_properties {
|
||||
uint64_t iova;
|
||||
bool load;
|
||||
bool store;
|
||||
bool load_stencil;
|
||||
bool store_stencil;
|
||||
};
|
||||
|
||||
auto get_hash = [&](uint32_t *data, size_t size) {
|
||||
uint32_t *ptr = data;
|
||||
*ptr++ = framebuffer->width;
|
||||
|
|
@ -596,12 +612,18 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
|
|||
*ptr++ = framebuffer->layers;
|
||||
|
||||
for (unsigned i = 0; i < pass->attachment_count; i++) {
|
||||
*ptr++ = cmd->state.attachments[i]->view.width;
|
||||
*ptr++ = cmd->state.attachments[i]->view.height;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.format;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
|
||||
packed_att_properties props = {
|
||||
.iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset,
|
||||
.load = pass->attachments[i].load,
|
||||
.store = pass->attachments[i].store,
|
||||
.load_stencil = pass->attachments[i].load_stencil,
|
||||
.store_stencil = pass->attachments[i].store_stencil,
|
||||
};
|
||||
|
||||
memcpy(ptr, &props, sizeof(packed_att_properties));
|
||||
ptr += sizeof(packed_att_properties) / sizeof(uint32_t);
|
||||
}
|
||||
assert(ptr == data + size);
|
||||
|
||||
return XXH3_64bits(data, size * sizeof(uint32_t));
|
||||
};
|
||||
|
|
@ -609,8 +631,8 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
|
|||
/* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of
|
||||
* cases, while only extreme cases need to allocate on the heap.
|
||||
*/
|
||||
size_t data_count = 3 + (pass->attachment_count * 5);
|
||||
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 5); /* in u32 units. */
|
||||
size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t));
|
||||
constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */
|
||||
|
||||
if (data_count <= STACK_MAX_DATA_COUNT) {
|
||||
/* If the data is small enough, we can use the stack. */
|
||||
|
|
@ -623,6 +645,11 @@ tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass,
|
|||
}
|
||||
}
|
||||
|
||||
tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates)
|
||||
{
|
||||
hash = XXH3_64bits_withSeed(&key.hash, sizeof(key.hash), duplicates);
|
||||
}
|
||||
|
||||
/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing
|
||||
* factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation).
|
||||
*/
|
||||
|
|
@ -670,6 +697,7 @@ template <typename T = double> class exponential_average {
|
|||
struct tu_autotune::rp_history {
|
||||
public:
|
||||
uint64_t hash; /* The hash of the renderpass, just for debug output. */
|
||||
uint32_t duplicates; /* The amount of times we've seen this RP, used for identifying repeated RPs. */
|
||||
|
||||
std::atomic<uint32_t> refcount = 0; /* Reference count to prevent deletion when active. */
|
||||
std::atomic<uint64_t> last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. */
|
||||
|
|
@ -979,6 +1007,16 @@ tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device,
|
|||
return new_entry.get();
|
||||
}
|
||||
|
||||
tu_autotune::rp_entry *
|
||||
tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key)
|
||||
{
|
||||
for (auto &entry : batch->entries) {
|
||||
if (entry->history->hash == key.hash)
|
||||
return entry.get();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
tu_autotune::render_mode
|
||||
tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx)
|
||||
{
|
||||
|
|
@ -1031,6 +1069,17 @@ tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx
|
|||
return render_mode::SYSMEM;
|
||||
|
||||
rp_key key(pass, framebuffer, cmd_buffer);
|
||||
|
||||
/* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a
|
||||
* unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses
|
||||
* across different command buffers, it is good enough in most cases.
|
||||
*/
|
||||
rp_entry *entry = cb_ctx.find_rp_entry(key);
|
||||
if (entry) {
|
||||
entry->duplicates++;
|
||||
key = rp_key(key, entry->duplicates);
|
||||
}
|
||||
|
||||
*rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count);
|
||||
rp_history &history = *((*rp_ctx)->history);
|
||||
|
||||
|
|
|
|||
|
|
@ -157,6 +157,9 @@ struct tu_autotune {
|
|||
const struct tu_framebuffer *framebuffer,
|
||||
const struct tu_cmd_buffer *cmd);
|
||||
|
||||
/* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
|
||||
rp_key(const rp_key &key, uint32_t duplicates);
|
||||
|
||||
/* Equality operator, used in unordered_map. */
|
||||
constexpr bool operator==(const rp_key &other) const noexcept
|
||||
{
|
||||
|
|
@ -211,6 +214,8 @@ struct tu_autotune {
|
|||
rp_entry *
|
||||
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
|
||||
|
||||
rp_entry *find_rp_entry(const rp_key &key);
|
||||
|
||||
friend struct tu_autotune;
|
||||
|
||||
public:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue