mesa/src/intel/vulkan/anv_va.c
Lionel Landwerlin 692e1ab2c1 anv: get rid of the second dynamic state heap
Pretty big change... Sorry for that.

I can't exactly remember why I created 2 heaps. I think it's because I
mistakenly thought the samplers in the binding sampler pointers needed
to be indexed from the binding table. But that's not the case, they
just need to be in the dynamic state heap.

In the future, this change will also allow allocating buffers for
push constant data in the newly created dynamic_visible_pool, which
will be useful on < Gfx12.0, where this is the only place push constant
data can live for compute shaders.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30047>
2024-07-19 12:21:46 +00:00

183 lines
7.5 KiB
C

/*
* Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "anv_private.h"
#include "util/u_math.h"
/**
 * Store a virtual address range and report where it ends.
 *
 * Writes \p addr and \p size into \p range, then returns addr + size so the
 * caller can chain consecutive ranges by feeding the result back in as the
 * next start address.
 *
 * \param range  range descriptor to fill in
 * \param addr   first address of the range
 * \param size   length of the range in bytes
 * \return       the first address past the end of the range
 */
static uint64_t
va_add(struct anv_va_range *range, uint64_t addr, uint64_t size)
{
   const uint64_t end = addr + size;

   range->size = size;
   range->addr = addr;

   return end;
}
/**
 * Store a virtual address range at an explicit location.
 *
 * Same bookkeeping as va_add(), but nothing is returned: this is for ranges
 * that are placed at a fixed address rather than appended to a running
 * cursor.
 *
 * \param range  range descriptor to fill in
 * \param addr   first address of the range
 * \param size   length of the range in bytes
 */
static void
va_at(struct anv_va_range *range, uint64_t addr, uint64_t size)
{
   range->size = size;
   range->addr = addr;
}
static void
anv_device_print_vas(struct anv_physical_device *device)
{
fprintf(stderr, "Driver heaps:\n");
#define PRINT_HEAP(name) \
fprintf(stderr, " 0x%016"PRIx64"-0x%016"PRIx64": %s\n", \
device->va.name.addr, \
device->va.name.addr + device->va.name.size, \
#name);
PRINT_HEAP(general_state_pool);
PRINT_HEAP(low_heap);
PRINT_HEAP(binding_table_pool);
PRINT_HEAP(internal_surface_state_pool);
PRINT_HEAP(scratch_surface_state_pool);
PRINT_HEAP(bindless_surface_state_pool);
PRINT_HEAP(indirect_descriptor_pool);
PRINT_HEAP(indirect_push_descriptor_pool);
PRINT_HEAP(instruction_state_pool);
PRINT_HEAP(dynamic_state_pool);
PRINT_HEAP(dynamic_visible_pool);
PRINT_HEAP(push_descriptor_buffer_pool);
PRINT_HEAP(high_heap);
PRINT_HEAP(trtt);
}
/**
 * Fill in every range of device->va with its fixed virtual address layout.
 *
 * The ranges are laid out back to back with a running `address` cursor that
 * starts at 2MiB; va_add() stores a range and returns the cursor advanced
 * past it. The layout depends on device->indirect_descriptors,
 * device->info.verx10, device->info.mem_alignment and device->gtt_size.
 *
 * When INTEL_DEBUG(DEBUG_HEAPS) is set, the resulting layout is dumped with
 * anv_device_print_vas().
 *
 * \param device  physical device whose va ranges are initialized
 */
void
anv_physical_device_init_va_ranges(struct anv_physical_device *device)
{
/* anv Virtual Memory Layout
* =========================
*
* When the anv driver is determining the virtual graphics addresses of
* memory objects itself using the softpin mechanism, the following memory
* ranges will be used.
*
* Three special considerations to notice:
*
* (1) the dynamic state pool is located within the same 4 GiB as the low
* heap. This is to work around a VF cache issue described in a comment in
* anv_physical_device_init_heaps.
*
* NOTE(review): item (1) looks stale. The code below places the dynamic
* state pool 1 GiB after the end of the instruction state pool (i.e. at
* 11 GiB or higher), while the low heap sits at 1 GiB-2 GiB. Confirm whether
* the VF cache constraint still applies.
*
* (2) the binding table pool is located at lower addresses than the BT
* (binding table) surface state pool, within a 4 GiB range which also
* contains the bindless surface state pool. This allows surface state base
* addresses to cover both binding tables (16 bit offsets), the internal
* surface states (32 bit offsets) and the bindless surface states.
*
* (3) the last 4 GiB of the address space is withheld from the high heap.
* Various hardware units will read past the end of an object for various
* reasons. This healthy margin prevents reads from wrapping around 48-bit
* addresses.
*/
/* Size helpers. Despite the "b" suffix these are byte counts in binary
* units (MiB / GiB).
*/
uint64_t _1Mb = 1ull * 1024 * 1024;
uint64_t _1Gb = 1ull * 1024 * 1024 * 1024;
uint64_t _4Gb = 4ull * 1024 * 1024 * 1024;
uint64_t address = 0x000000200000ULL; /* 2MiB */
/* The general state pool covers everything from 2MiB up to the 1 GiB mark. */
address = va_add(&device->va.general_state_pool, address,
_1Gb - address);
/* The low heap takes the next 1 GiB (1 GiB-2 GiB). */
address = va_add(&device->va.low_heap, address, _1Gb);
/* The binding table pool has to be located directly in front of the
* surface states.
*/
/* Skip 1 GiB so the binding table pool lands at 3 GiB and the internal
* surface state pool starts at 4 GiB, satisfying the 2 GiB alignment
* asserted below.
*/
address += _1Gb;
address = va_add(&device->va.binding_table_pool, address, _1Gb);
address = va_add(&device->va.internal_surface_state_pool, address, 1 * _1Gb);
assert(device->va.internal_surface_state_pool.addr ==
align64(device->va.internal_surface_state_pool.addr, 2 * _1Gb));
/* Scratch surface state overlaps with the internal surface state */
/* Placed with va_at() so the cursor is not advanced: the scratch range is
* the first 8 MiB of the internal surface state pool.
*/
va_at(&device->va.scratch_surface_state_pool,
device->va.internal_surface_state_pool.addr,
8 * _1Mb);
address = va_add(&device->va.bindless_surface_state_pool, address, 2 * _1Gb);
if (device->indirect_descriptors) {
/* With indirect descriptors, descriptor buffers can go anywhere, they
* just need to be in a 4Gb aligned range, so all shader accesses can
* use a relocatable upper dword for the 64bit address.
*/
address = align64(address, _4Gb);
/* 3 GiB + 1 GiB: both indirect pools together fill one 4 GiB range. */
address = va_add(&device->va.indirect_descriptor_pool, address, 3 * _1Gb);
address = va_add(&device->va.indirect_push_descriptor_pool, address, _1Gb);
}
/* We use a trick to compute constant data offsets in the shaders to avoid
* unnecessary 64bit address computations (see lower_load_constant() in
* anv_nir_apply_pipeline_layout.c). This assumes the instruction pool is
* located at an address with the lower 32bits at 0.
*/
address = align64(address, _4Gb);
address = va_add(&device->va.instruction_state_pool, address, 2 * _1Gb);
/* Leave a 1 GiB hole: 2 GiB (instructions) + 1 GiB (hole) + 1 GiB (dynamic
* state) puts the dynamic visible pool on the next 4 GiB boundary, which
* the assert below checks.
*/
address += 1 * _1Gb;
address = va_add(&device->va.dynamic_state_pool, address, _1Gb);
/* On verx10 >= 125 the dynamic visible pool is 2 GiB and is followed by a
* (1 GiB - 4096) push descriptor buffer pool; otherwise the dynamic visible
* pool alone is (3 GiB - 4096). Either way the cursor advances by the same
* total amount.
*/
address = va_add(&device->va.dynamic_visible_pool, address,
device->info.verx10 >= 125 ? (2 * _1Gb) : (3 * _1Gb - 4096));
assert(device->va.dynamic_visible_pool.addr % _4Gb == 0);
if (device->info.verx10 >= 125)
address = va_add(&device->va.push_descriptor_buffer_pool, address, _1Gb - 4096);
/* Round the cursor up to the device memory alignment before placing the
* 2 GiB aux_tt_pool.
*/
address = align64(address, device->info.mem_alignment);
address = va_add(&device->va.aux_tt_pool, address, 2 * _1Gb);
/* What's left to do for us is to set va.high_heap and va.trtt without
* overlap, but there are a few things to be considered:
*
* The TR-TT address space is governed by the GFX_TRTT_VA_RANGE register,
* which carves out part of the address space for TR-TT and is independent
* of device->gtt_size. We use 47:44 for gen9+, the values we set here
* should be in sync with what we write to the register.
*
* If we ever gain the capability to use more than 48 bits of address space
* we'll have to adjust where we put the TR-TT space (and how we set
* GFX_TRTT_VA_RANGE).
*
* We have to leave the last 4GiB out of the high vma range, so that no
* state base address + size can overflow 48 bits. For more information see
* the comment about Wa32bitGeneralStateOffset in anv_allocator.c
*
* Despite the comment above, before we had TR-TT we were not only avoiding
* the last 4GiB of the 48bit address space, but also avoiding the last
* 4GiB from gtt_size, so let's be on the safe side and do the 4GiB
* avoiding for both the TR-TT space top and the gtt top.
*/
assert(device->gtt_size <= (1uLL << 48));
/* TR-TT occupies [0xF << 44, 2^48 - 4 GiB). */
uint64_t trtt_start = 0xFuLL << 44;
uint64_t trtt_end = (1uLL << 48) - 4 * _1Gb;
/* The high heap ends 4 GiB below whichever comes first: the GTT size or the
* start of the TR-TT range.
*/
uint64_t addressable_top = MIN2(device->gtt_size, trtt_start) - 4 * _1Gb;
/* NOTE(review): unsigned subtraction with no check here; if addressable_top
* were below the current cursor this would wrap around. Presumably gtt_size
* is always large enough — confirm against the caller.
*/
uint64_t user_heaps_size = addressable_top - address;
address = va_add(&device->va.high_heap, address, user_heaps_size);
assert(address <= trtt_start);
/* The value stored back into `address` here is not read again in this
* function.
*/
address = va_add(&device->va.trtt, trtt_start, trtt_end - trtt_start);
if (INTEL_DEBUG(DEBUG_HEAPS))
anv_device_print_vas(device);
}