mirror of https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 08:50:13 +01:00
anv: Delete relocation support from batch submission
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18208>
This commit is contained in:
parent 3fd4a294f5
commit c5f7e1f5b4

2 changed files with 24 additions and 275 deletions
src/intel/vulkan/anv_batch_chain.c:

@@ -43,9 +43,10 @@
  *
  * This file contains functions related to anv_cmd_buffer as a data
  * structure. This involves everything required to create and destroy
- * the actual batch buffers as well as link them together and handle
- * relocations and surface state. It specifically does *not* contain any
- * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
+ * the actual batch buffers as well as link them together.
+ *
+ * It specifically does *not* contain any handling of actual vkCmd calls
+ * beyond vkCmdExecuteCommands.
  */

 /*-----------------------------------------------------------------------*
@@ -69,26 +70,15 @@ anv_reloc_list_init_clone(struct anv_reloc_list *list,
    list->array_length = other_list->array_length;

    if (list->num_relocs > 0) {
-      list->relocs =
-         vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
-                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-      if (list->relocs == NULL)
-         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
-
       list->reloc_bos =
          vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-      if (list->reloc_bos == NULL) {
-         vk_free(alloc, list->relocs);
+      if (list->reloc_bos == NULL)
          return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
-      }

-      memcpy(list->relocs, other_list->relocs,
-             list->array_length * sizeof(*list->relocs));
       memcpy(list->reloc_bos, other_list->reloc_bos,
              list->array_length * sizeof(*list->reloc_bos));
    } else {
-      list->relocs = NULL;
       list->reloc_bos = NULL;
    }

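Interleaved hunks like the one above are easier to verify against the assembled result. Reading only the context and `+` lines, the cloned-list path of anv_reloc_list_init_clone reduces to the following; this is a reading aid derived from the hunk, not code added by the commit:

   list->array_length = other_list->array_length;

   if (list->num_relocs > 0) {
      /* Only the BO list survives; the drm_i915_gem_relocation_entry
       * array and its error-unwind path are gone.
       */
      list->reloc_bos =
         vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (list->reloc_bos == NULL)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);

      memcpy(list->reloc_bos, other_list->reloc_bos,
             list->array_length * sizeof(*list->reloc_bos));
   } else {
      list->reloc_bos = NULL;
   }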
@@ -111,7 +101,6 @@ void
 anv_reloc_list_finish(struct anv_reloc_list *list,
                       const VkAllocationCallbacks *alloc)
 {
-   vk_free(alloc, list->relocs);
    vk_free(alloc, list->reloc_bos);
    vk_free(alloc, list->deps);
 }
@@ -128,14 +117,6 @@ anv_reloc_list_grow(struct anv_reloc_list *list,
    while (new_length < list->num_relocs + num_additional_relocs)
       new_length *= 2;

-   struct drm_i915_gem_relocation_entry *new_relocs =
-      vk_realloc(alloc, list->relocs,
-                 new_length * sizeof(*list->relocs), 8,
-                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (new_relocs == NULL)
-      return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
-   list->relocs = new_relocs;
-
    struct anv_bo **new_reloc_bos =
       vk_realloc(alloc, list->reloc_bos,
                  new_length * sizeof(*list->reloc_bos), 8,
@@ -212,14 +193,9 @@ anv_reloc_list_append(struct anv_reloc_list *list,
       return result;

    if (other->num_relocs > 0) {
-      memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
-             other->num_relocs * sizeof(other->relocs[0]));
       memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
              other->num_relocs * sizeof(other->reloc_bos[0]));

-      for (uint32_t i = 0; i < other->num_relocs; i++)
-         list->relocs[i + list->num_relocs].offset += offset;
-
       list->num_relocs += other->num_relocs;
    }

@@ -1081,13 +1057,6 @@ struct anv_execbuf {
    uint32_t cmd_buffer_count;
    struct anv_query_pool *perf_query_pool;

-   /* Indicates whether any of the command buffers have relocations. This
-    * doesn't not necessarily mean we'll need the kernel to process them. It
-    * might be that a previous execbuf has already placed things in the VMA
-    * and we can make i915 skip the relocations.
-    */
-   bool has_relocs;
-
    const VkAllocationCallbacks *alloc;
    VkSystemAllocationScope alloc_scope;

@@ -1198,27 +1167,12 @@ anv_execbuf_add_bo(struct anv_device *device,
    }

    if (relocs != NULL) {
-      assert(obj->relocation_count == 0);
-
-      if (relocs->num_relocs > 0) {
-         /* This is the first time we've ever seen a list of relocations for
-          * this BO. Go ahead and set the relocations and then walk the list
-          * of relocations and add them all.
-          */
-         exec->has_relocs = true;
-         obj->relocation_count = relocs->num_relocs;
-         obj->relocs_ptr = (uintptr_t) relocs->relocs;
-
-         for (size_t i = 0; i < relocs->num_relocs; i++) {
-            VkResult result;
-
-            /* A quick sanity check on relocations */
-            assert(relocs->relocs[i].offset < bo->size);
-            result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
-                                        NULL, extra_flags);
-            if (result != VK_SUCCESS)
-               return result;
-         }
-      }
+      for (size_t i = 0; i < relocs->num_relocs; i++) {
+         VkResult result =
+            anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
+                               NULL, extra_flags);
+         if (result != VK_SUCCESS)
+            return result;
+      }

       return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
@@ -1253,141 +1207,6 @@ anv_execbuf_add_bo_bitset(struct anv_device *device,
    return VK_SUCCESS;
 }

-static void
-anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
-                              struct anv_reloc_list *list)
-{
-   for (size_t i = 0; i < list->num_relocs; i++) {
-      list->relocs[i].target_handle = list->reloc_bos[i]->exec_obj_index;
-   }
-}
-
-static void
-anv_reloc_list_apply(struct anv_device *device,
-                     struct anv_reloc_list *list,
-                     struct anv_bo *bo,
-                     bool always_relocate)
-{
-   for (size_t i = 0; i < list->num_relocs; i++) {
-      struct anv_bo *target_bo = list->reloc_bos[i];
-      if (list->relocs[i].presumed_offset == target_bo->offset &&
-          !always_relocate)
-         continue;
-
-      void *p = bo->map + list->relocs[i].offset;
-      write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
-      list->relocs[i].presumed_offset = target_bo->offset;
-   }
-}
-
-/**
- * This function applies the relocation for a command buffer and writes the
- * actual addresses into the buffers as per what we were told by the kernel on
- * the previous execbuf2 call. This should be safe to do because, for each
- * relocated address, we have two cases:
- *
- * 1) The target BO is inactive (as seen by the kernel). In this case, it is
- *    not in use by the GPU so updating the address is 100% ok. It won't be
- *    in-use by the GPU (from our context) again until the next execbuf2
- *    happens. If the kernel decides to move it in the next execbuf2, it
- *    will have to do the relocations itself, but that's ok because it should
- *    have all of the information needed to do so.
- *
- * 2) The target BO is active (as seen by the kernel). In this case, it
- *    hasn't moved since the last execbuffer2 call because GTT shuffling
- *    *only* happens when the BO is idle. (From our perspective, it only
- *    happens inside the execbuffer2 ioctl, but the shuffling may be
- *    triggered by another ioctl, with full-ppgtt this is limited to only
- *    execbuffer2 ioctls on the same context, or memory pressure.) Since the
- *    target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
- *    address and the relocated value we are writing into the BO will be the
- *    same as the value that is already there.
- *
- * There is also a possibility that the target BO is active but the exact
- * RENDER_SURFACE_STATE object we are writing the relocation into isn't in
- * use. In this case, the address currently in the RENDER_SURFACE_STATE
- * may be stale but it's still safe to write the relocation because that
- * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
- * won't be until the next execbuf2 call.
- *
- * By doing relocations on the CPU, we can tell the kernel that it doesn't
- * need to bother. We want to do this because the surface state buffer is
- * used by every command buffer so, if the kernel does the relocations, it
- * will always be busy and the kernel will always stall. This is also
- * probably the fastest mechanism for doing relocations since the kernel would
- * have to make a full copy of all the relocations lists.
- */
-static bool
-execbuf_can_skip_relocations(struct anv_execbuf *exec)
-{
-   if (!exec->has_relocs)
-      return true;
-
-   static int userspace_relocs = -1;
-   if (userspace_relocs < 0)
-      userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
-   if (!userspace_relocs)
-      return false;
-
-   /* First, we have to check to see whether or not we can even do the
-    * relocation. New buffers which have never been submitted to the kernel
-    * don't have a valid offset so we need to let the kernel do relocations so
-    * that we can get offsets for them. On future execbuf2 calls, those
-    * buffers will have offsets and we will be able to skip relocating.
-    * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
-    */
-   for (uint32_t i = 0; i < exec->bo_count; i++) {
-      if (exec->bos[i]->offset == (uint64_t)-1)
-         return false;
-   }
-
-   return true;
-}
-
-static void
-relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
-                    struct anv_execbuf *exec)
-{
-   /* Since surface states are shared between command buffers and we don't
-    * know what order they will be submitted to the kernel, we don't know
-    * what address is actually written in the surface state object at any
-    * given time. The only option is to always relocate them.
-    */
-   struct anv_bo *surface_state_bo =
-      cmd_buffer->device->surface_state_pool.block_pool.bo;
-   anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
-                        surface_state_bo,
-                        true /* always relocate surface states */);
-
-   /* Since we own all of the batch buffers, we know what values are stored
-    * in the relocated addresses and only have to update them if the offsets
-    * have changed.
-    */
-   struct anv_batch_bo **bbo;
-   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
-      anv_reloc_list_apply(cmd_buffer->device,
-                           &(*bbo)->relocs, (*bbo)->bo, false);
-   }
-
-   for (uint32_t i = 0; i < exec->bo_count; i++)
-      exec->objects[i].offset = exec->bos[i]->offset;
-}
-
-static void
-reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
-{
-   /* In the case where we fall back to doing kernel relocations, we need to
-    * ensure that the relocation list is valid. All relocations on the batch
-    * buffers are already valid and kept up-to-date. Since surface states are
-    * shared between command buffers and we don't know what order they will be
-    * submitted to the kernel, we don't know what address is actually written
-    * in the surface state object at any given time. The only option is to set
-    * a bogus presumed offset and let the kernel relocate them.
-    */
-   for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
-      cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
-}
-
 static VkResult
 anv_execbuf_add_syncobj(struct anv_device *device,
                         struct anv_execbuf *exec,
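For context on the machinery deleted above: anv_reloc_list_apply patched each pointer through write_reloc, whose definition is outside this diff. A minimal sketch of what such a write amounts to, assuming a Gen8+-style 64-bit address (the real helper also handles 32-bit writes on older hardware and flushes the cacheline when the mapping is not coherent):

#include <stdint.h>

/* Sketch only: patch a relocated GPU address into a CPU-mapped BO.
 * 'p' corresponds to bo->map + reloc->offset and 'v' to
 * target_bo->offset + reloc->delta in the deleted loop above.
 */
static inline void
sketch_write_reloc(void *p, uint64_t v)
{
   *(uint64_t *)p = v;   /* assumption: 64-bit GPU addresses */
}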
@@ -1487,9 +1306,6 @@ static VkResult
 setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
                              struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_state_pool *ss_pool =
-      &cmd_buffer->device->surface_state_pool;
-
    VkResult result;
    /* Add surface dependencies (BOs) to the execbuf */
    anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
@@ -1534,7 +1350,6 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
                              uint32_t num_cmd_buffers)
 {
    struct anv_device *device = queue->device;
-   struct anv_state_pool *ss_pool = &device->surface_state_pool;
    VkResult result;

    /* Edit the tail of the command buffers to chain them all together if they
@@ -1550,13 +1365,14 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
    }

    /* Add all the global BOs to the object list for softpin case. */
-   anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
+   struct anv_block_pool *pool;
+   pool = &device->surface_state_pool.block_pool;
+   anv_block_pool_foreach_bo(bo, pool) {
       result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
       if (result != VK_SUCCESS)
          return result;
    }

-   struct anv_block_pool *pool;
    pool = &device->dynamic_state_pool.block_pool;
    anv_block_pool_foreach_bo(bo, pool) {
       result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
@@ -1595,56 +1411,8 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
       return result;
    }

-   bool no_reloc = true;
-   if (execbuf->has_relocs) {
-      no_reloc = execbuf_can_skip_relocations(execbuf);
-      if (no_reloc) {
-         /* If we were able to successfully relocate everything, tell the
-          * kernel that it can skip doing relocations. The requirement for
-          * using NO_RELOC is:
-          *
-          *  1) The addresses written in the objects must match the
-          *     corresponding reloc.presumed_offset which in turn must match
-          *     the corresponding execobject.offset.
-          *
-          *  2) To avoid stalling, execobject.offset should match the current
-          *     address of that object within the active context.
-          *
-          * In order to satisfy all of the invariants that make userspace
-          * relocations to be safe (see relocate_cmd_buffer()), we need to
-          * further ensure that the addresses we use match those used by the
-          * kernel for the most recent execbuf2.
-          *
-          * The kernel may still choose to do relocations anyway if something
-          * has moved in the GTT. In this case, the relocation list still
-          * needs to be valid. All relocations on the batch buffers are
-          * already valid and kept up-to-date. For surface state relocations,
-          * by applying the relocations in relocate_cmd_buffer, we ensured
-          * that the address in the RENDER_SURFACE_STATE matches
-          * presumed_offset, so it should be safe for the kernel to relocate
-          * them as needed.
-          */
-         for (uint32_t i = 0; i < num_cmd_buffers; i++) {
-            relocate_cmd_buffer(cmd_buffers[i], execbuf);
-
-            anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
-                                 device->surface_state_pool.block_pool.bo,
-                                 true /* always relocate surface states */);
-         }
-      } else {
-         /* In the case where we fall back to doing kernel relocations, we
-          * need to ensure that the relocation list is valid. All relocations
-          * on the batch buffers are already valid and kept up-to-date. Since
-          * surface states are shared between command buffers and we don't
-          * know what order they will be submitted to the kernel, we don't
-          * know what address is actually written in the surface state object
-          * at any given time. The only option is to set a bogus presumed
-          * offset and let the kernel relocate them.
-          */
-         for (uint32_t i = 0; i < num_cmd_buffers; i++)
-            reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
-      }
-   }
+   for (uint32_t i = 0; i < execbuf->bo_count; i++)
+      execbuf->objects[i].offset = execbuf->bos[i]->offset;

    struct anv_batch_bo *first_batch_bo =
       list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
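With the branch above deleted, submission relies entirely on softpin: every BO already has a fixed GPU address, and the two `+` lines simply report those addresses back to the kernel. A sketch of the resulting invariant; the (uint64_t)-1 sentinel for a never-placed BO comes from the deleted execbuf_can_skip_relocations, and asserting it here is an illustration, not code from this commit:

/* Under softpin, userspace owns placement: every BO must already have a
 * valid fixed address, the exec object merely reports it, and
 * I915_EXEC_NO_RELOC tells i915 it never needs to patch anything.
 */
for (uint32_t i = 0; i < execbuf->bo_count; i++) {
   assert(execbuf->bos[i]->offset != (uint64_t)-1);      /* already placed */
   execbuf->objects[i].offset = execbuf->bos[i]->offset; /* report, don't relocate */
}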
@@ -1670,25 +1438,9 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
       first_batch_bo->bo->exec_obj_index = last_idx;
    }

-   /* If we are pinning our BOs, we shouldn't have to relocate anything */
-   assert(!execbuf->has_relocs);
-
-   /* Now we go through and fixup all of the relocation lists to point to the
-    * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
-    * do this after we reorder the list above as some of the indices may have
-    * changed.
-    */
-   struct anv_batch_bo **bbo;
-   if (execbuf->has_relocs) {
-      assert(num_cmd_buffers == 1);
-      u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
-         anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
-
-      anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
-   }
-
    if (device->physical->memory.need_clflush) {
       __builtin_ia32_mfence();
+      struct anv_batch_bo **bbo;
       for (uint32_t i = 0; i < num_cmd_buffers; i++) {
          u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
            for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE)
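The surviving clflush path walks every batch BO one cacheline at a time before submission. The innermost loop body is cut off by the hunk boundary; a standalone sketch of the pattern, assuming the body issues a CLFLUSH on the BO's CPU mapping (map and length stand in for (*bbo)->bo->map and (*bbo)->length):

#include <stdint.h>

#define CACHELINE_SIZE 64   /* assumption: 64-byte x86 cachelines */

/* Sketch: on platforms without CPU/GPU cache coherency, flush a
 * CPU-written batch buffer out to memory before the GPU reads it.
 */
static void
sketch_flush_batch(const char *map, uint32_t length)
{
   for (uint32_t l = 0; l < length; l += CACHELINE_SIZE)
      __builtin_ia32_clflush(map + l);
}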
@@ -1707,7 +1459,9 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
       .num_cliprects = 0,
       .DR1 = 0,
       .DR4 = 0,
-      .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
+      .flags = I915_EXEC_NO_RELOC |
+               I915_EXEC_HANDLE_LUT |
+               queue->exec_flags,
       .rsvd1 = device->context_id,
       .rsvd2 = 0,
    };
@@ -1778,8 +1532,10 @@ setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
       .buffer_count = execbuf->bo_count,
       .batch_start_offset = 0,
       .batch_len = flush->batch.next - flush->batch.start,
-      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags |
-               (execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC),
+      .flags = I915_EXEC_NO_RELOC |
+               I915_EXEC_HANDLE_LUT |
+               I915_EXEC_FENCE_ARRAY |
+               queue->exec_flags,
       .rsvd1 = device->context_id,
       .rsvd2 = 0,
       .num_cliprects = execbuf->syncobj_count,
@@ -1830,12 +1586,6 @@ anv_queue_exec_utrace_locked(struct anv_queue *queue,
     * up in the wild due to a broken app. It's better to play it safe and
     * just lock around QueueSubmit.
     *
-    * 3) The anv_cmd_buffer_execbuf function may perform relocations in
-    *    userspace. Due to the fact that the surface state buffer is shared
-    *    between batches, we can't afford to have that happen from multiple
-    *    threads at the same time. Even though the user is supposed to ensure
-    *    this doesn't happen, we play it safe as in (2) above.
-    *
     * Since the only other things that ever take the device lock such as block
     * pool resize only rarely happen, this will almost never be contended so
     * taking a lock isn't really an expensive operation in this case.
src/intel/vulkan/anv_private.h:

@@ -1349,7 +1349,6 @@ void anv_vma_free(struct anv_device *device,
 struct anv_reloc_list {
    uint32_t num_relocs;
    uint32_t array_length;
-   struct drm_i915_gem_relocation_entry *relocs;
    struct anv_bo **reloc_bos;
    uint32_t dep_words;
    BITSET_WORD *deps;
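Assembling the context and surviving lines, the structure after this commit reads roughly as follows (any fields beyond deps are not visible in this hunk):

struct anv_reloc_list {
   uint32_t         num_relocs;
   uint32_t         array_length;
   struct anv_bo ** reloc_bos;   /* added to the execbuf in anv_execbuf_add_bo() */
   uint32_t         dep_words;
   BITSET_WORD *    deps;        /* consumed by anv_execbuf_add_bo_bitset() */
};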