anv/trtt: convert anv_trtt_bind arrays to util_dynarray

Since the L2 bug fix we've been overestimating l3l2_binds by a lot in
most of the cases: almost every single call to anv_sparse_bind_trtt
ends up using either 0 or 1 elements for l3l2_binds, with occasionally
something using 512 or more. By switching to util_dynarray we can
guarantee the best of every case:
  - l1_binds will remain a stack array for the vast majority of the
    calls
      - even more than before, since STACK_ARRAY was limited to 8
        elements and now we do 32
  - l1 will be properly dimensioned without the need for reallocs
  - l3l2_binds will be completely empty most of the times and only
    trigger allocations when necessary

Here's the top 10 most common results of anv_sparse_bind_trtt() for a
trace of Assassin's Creed: Valhalla. The first column is how many
times we had that case while running the trace. After this patch, all
these cases will proceed without any memory allocations.

    168 trtt_binds: num_vm_binds:04 l3l2:0000 l1:0004
    344 trtt_binds: num_vm_binds:01 l3l2:0000 l1:0004
    420 trtt_binds: num_vm_binds:01 l3l2:0000 l1:0012
    422 trtt_binds: num_vm_binds:04 l3l2:0000 l1:0008
    479 trtt_binds: num_vm_binds:01 l3l2:0000 l1:0024
    560 trtt_binds: num_vm_binds:03 l3l2:0000 l1:0003
   1005 trtt_binds: num_vm_binds:01 l3l2:0000 l1:0002
   1024 trtt_binds: num_vm_binds:02 l3l2:0000 l1:0004
   2145 trtt_binds: num_vm_binds:02 l3l2:0000 l1:0002
   3735 trtt_binds: num_vm_binds:01 l3l2:0000 l1:0001

Only 70 out of total 11340 calls to anv_sparse_bind_trtt() contained
l3l2 elements.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30953>
This commit is contained in:
Paulo Zanoni 2024-08-28 14:54:54 -07:00 committed by Marge Bot
parent 5839b9d083
commit 8280a6fee9

View file

@ -412,14 +412,14 @@ trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo,
#define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0)
static void
anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, uint32_t *binds_len,
uint64_t pte_addr, uint64_t entry_addr)
anv_trtt_bind_list_add_entry(struct util_dynarray *binds, uint64_t pte_addr,
uint64_t entry_addr)
{
binds[*binds_len] = (struct anv_trtt_bind) {
struct anv_trtt_bind b = {
.pte_addr = pte_addr,
.entry_addr = entry_addr,
};
(*binds_len)++;
util_dynarray_append(binds, struct anv_trtt_bind, b);
}
/* Adds elements to the anv_trtt_bind structs passed. This doesn't write the
@ -428,10 +428,8 @@ anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, uint32_t *binds_len,
static VkResult
anv_trtt_bind_add(struct anv_device *device,
uint64_t trtt_addr, uint64_t dest_addr,
struct anv_trtt_bind *l3l2_binds,
uint32_t *n_l3l2_binds,
struct anv_trtt_bind *l1_binds,
uint32_t *n_l1_binds)
struct util_dynarray *l3l2_binds,
struct util_dynarray *l1_binds)
{
VkResult result = VK_SUCCESS;
struct anv_trtt *trtt = &device->trtt;
@ -448,9 +446,8 @@ anv_trtt_bind_add(struct anv_device *device,
if (is_null_bind) {
trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
trtt->l3_addr + l3_index *
sizeof(uint64_t),
anv_trtt_bind_list_add_entry(l3l2_binds, trtt->l3_addr +
l3_index * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
return VK_SUCCESS;
@ -463,19 +460,22 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l3_mirror[l3_index] = l2_addr;
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
trtt->l3_addr + l3_index *
sizeof(uint64_t), l2_addr);
anv_trtt_bind_list_add_entry(l3l2_binds, trtt->l3_addr +
l3_index * sizeof(uint64_t), l2_addr);
/* We have just created a new L2 table. Other resources may already have
* been pointing to this L2 table relying on the fact that it was marked
* as NULL, so now we need to mark every one of its entries as NULL in
* order to preserve behavior for those entries.
*/
if (!util_dynarray_ensure_cap(l3l2_binds,
l3l2_binds->capacity + 512 * sizeof(struct anv_trtt_bind)))
return VK_ERROR_OUT_OF_HOST_MEMORY;
for (int i = 0; i < 512; i++) {
if (i != l2_index) {
trtt->l2_mirror[l3_index * 512 + i] = ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
anv_trtt_bind_list_add_entry(l3l2_binds,
l2_addr + i * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
}
@ -492,7 +492,7 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l2_mirror[l3_index * 512 + l2_index] =
ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
anv_trtt_bind_list_add_entry(l3l2_binds,
l2_addr + l2_index * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
@ -506,13 +506,13 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr;
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
anv_trtt_bind_list_add_entry(l3l2_binds,
l2_addr + l2_index * sizeof(uint64_t),
l1_addr);
}
assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY);
anv_trtt_bind_list_add_entry(l1_binds, n_l1_binds,
anv_trtt_bind_list_add_entry(l1_binds,
l1_addr + l1_index * sizeof(uint32_t),
dest_addr);
@ -665,20 +665,23 @@ anv_trtt_first_bind_init(struct anv_device *device)
/* We only need to do this once, so pick the first queue. */
if (i == 0) {
struct anv_trtt_bind entries[512];
uint32_t entries_added = 0;
struct anv_trtt_bind l3l2_binds_data[512];
struct util_dynarray l3l2_binds;
util_dynarray_init_from_stack(&l3l2_binds, l3l2_binds_data,
sizeof(l3l2_binds_data));
for (int entry = 0; entry < 512; entry++) {
trtt->l3_mirror[entry] = ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(entries, &entries_added,
anv_trtt_bind_list_add_entry(&l3l2_binds,
trtt->l3_addr +
entry * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
}
assert(entries_added == 512);
anv_genX(device->info, write_trtt_entries)(submit, entries, 512,
NULL, 0);
anv_genX(device->info, write_trtt_entries)(
submit, l3l2_binds.data,
util_dynarray_num_elements(&l3l2_binds, struct anv_trtt_bind),
NULL, 0);
result = anv_reloc_list_add_bo(&submit->relocs, l3_bo);
if (result != VK_SUCCESS) {
@ -740,33 +743,33 @@ anv_sparse_bind_trtt(struct anv_device *device,
simple_mtx_lock(&trtt->mutex);
/* These capacities are conservative estimations. For L1 binds the
* number will match exactly unless we skip NULL binds due to L2 already
* being NULL. For L3/L2 things are harder to estimate, but the resulting
* numbers are so small that a little overestimation won't hurt.
*
* We have assertions below to catch estimation errors.
*
* TODO: a bug fix caused us to put that "+ 1024" in the l3l2 capacity so
* now our estimations are super overestimating things in most cases, as
* most cases are still using a total capacity of just 1 or 2. We should
* replace this whole thing with something more efficient.
*/
int l3l2_binds_capacity = 1;
/* Do this so we can avoid reallocs later. */
int l1_binds_capacity = 0;
for (int b = 0; b < sparse_submit->binds_len; b++) {
assert(sparse_submit->binds[b].size % (64 * 1024) == 0);
int pages = sparse_submit->binds[b].size / (64 * 1024);
l1_binds_capacity += pages;
l3l2_binds_capacity += (pages / 1024 + 1) * 2 + 1024;
}
/* Turn a series of virtual address maps, into a list of L3/L2/L1 TRTT page
* table updates.
*/
STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity);
STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity);
uint32_t n_l3l2_binds = 0, n_l1_binds = 0;
/* These are arrays of struct anv_trtt_bind. */
struct util_dynarray l3l2_binds = {};
struct util_dynarray l1_binds;
if (l1_binds_capacity <= 32) {
size_t alloc_size = l1_binds_capacity * sizeof(struct anv_trtt_bind);
struct anv_trtt_bind *ptr = alloca(alloc_size);
util_dynarray_init_from_stack(&l1_binds, ptr, alloc_size);
} else {
util_dynarray_init(&l1_binds, NULL);
if (!util_dynarray_ensure_cap(&l1_binds,
l1_binds_capacity * sizeof(struct anv_trtt_bind)))
goto out_dynarrays;
}
for (int b = 0; b < sparse_submit->binds_len && result == VK_SUCCESS; b++) {
struct anv_vm_bind *vm_bind = &sparse_submit->binds[b];
for (size_t i = 0; i < vm_bind->size && result == VK_SUCCESS; i += 64 * 1024) {
@ -777,32 +780,33 @@ anv_sparse_bind_trtt(struct anv_device *device,
ANV_TRTT_L1_NULL_TILE_VAL;
result = anv_trtt_bind_add(device, trtt_addr, dest_addr,
l3l2_binds, &n_l3l2_binds,
l1_binds, &n_l1_binds);
&l3l2_binds, &l1_binds);
if (result != VK_SUCCESS)
goto out_stack_arrays;
goto out_dynarrays;
}
}
assert(n_l3l2_binds <= l3l2_binds_capacity);
assert(n_l1_binds <= l1_binds_capacity);
/* Convert the L3/L2/L1 TRTT page table updates in anv_trtt_bind elements
* into MI commands.
*/
uint32_t n_l3l2_binds =
util_dynarray_num_elements(&l3l2_binds, struct anv_trtt_bind);
uint32_t n_l1_binds =
util_dynarray_num_elements(&l1_binds, struct anv_trtt_bind);
sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n",
sparse_submit->binds_len, n_l3l2_binds, n_l1_binds);
/* This is not an error, the application is simply trying to reset state
* that was already there. */
if (n_l3l2_binds == 0 && n_l1_binds == 0)
goto out_stack_arrays;
goto out_dynarrays;
anv_genX(device->info, write_trtt_entries)(
&submit->base, l3l2_binds, n_l3l2_binds, l1_binds, n_l1_binds);
anv_genX(device->info, write_trtt_entries)(&submit->base,
l3l2_binds.data, n_l3l2_binds,
l1_binds.data, n_l1_binds);
STACK_ARRAY_FINISH(l1_binds);
STACK_ARRAY_FINISH(l3l2_binds);
util_dynarray_fini(&l1_binds);
util_dynarray_fini(&l3l2_binds);
anv_genX(device->info, async_submit_end)(&submit->base);
@ -847,9 +851,9 @@ anv_sparse_bind_trtt(struct anv_device *device,
return VK_SUCCESS;
out_stack_arrays:
STACK_ARRAY_FINISH(l1_binds);
STACK_ARRAY_FINISH(l3l2_binds);
out_dynarrays:
util_dynarray_fini(&l1_binds);
util_dynarray_fini(&l3l2_binds);
out_add_bind:
simple_mtx_unlock(&trtt->mutex);
anv_async_submit_fini(&submit->base);