anv: Delete relocation support from batch submission

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18208>
Kenneth Graunke 2022-08-30 17:17:53 -07:00 committed by Marge Bot
parent 3fd4a294f5
commit c5f7e1f5b4
2 changed files with 24 additions and 275 deletions

src/intel/vulkan/anv_batch_chain.c

@@ -43,9 +43,10 @@
*
* This file contains functions related to anv_cmd_buffer as a data
* structure. This involves everything required to create and destroy
* the actual batch buffers as well as link them together and handle
* relocations and surface state. It specifically does *not* contain any
* handling of actual vkCmd calls beyond vkCmdExecuteCommands.
* the actual batch buffers as well as link them together.
*
* It specifically does *not* contain any handling of actual vkCmd calls
* beyond vkCmdExecuteCommands.
*/
/*-----------------------------------------------------------------------*
@@ -69,26 +70,15 @@ anv_reloc_list_init_clone(struct anv_reloc_list *list,
list->array_length = other_list->array_length;
if (list->num_relocs > 0) {
list->relocs =
vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (list->relocs == NULL)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
list->reloc_bos =
vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (list->reloc_bos == NULL) {
vk_free(alloc, list->relocs);
if (list->reloc_bos == NULL)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
}
memcpy(list->relocs, other_list->relocs,
list->array_length * sizeof(*list->relocs));
memcpy(list->reloc_bos, other_list->reloc_bos,
list->array_length * sizeof(*list->reloc_bos));
} else {
list->relocs = NULL;
list->reloc_bos = NULL;
}
@@ -111,7 +101,6 @@ void
anv_reloc_list_finish(struct anv_reloc_list *list,
const VkAllocationCallbacks *alloc)
{
vk_free(alloc, list->relocs);
vk_free(alloc, list->reloc_bos);
vk_free(alloc, list->deps);
}
@@ -128,14 +117,6 @@ anv_reloc_list_grow(struct anv_reloc_list *list,
while (new_length < list->num_relocs + num_additional_relocs)
new_length *= 2;
struct drm_i915_gem_relocation_entry *new_relocs =
vk_realloc(alloc, list->relocs,
new_length * sizeof(*list->relocs), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_relocs == NULL)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
list->relocs = new_relocs;
struct anv_bo **new_reloc_bos =
vk_realloc(alloc, list->reloc_bos,
new_length * sizeof(*list->reloc_bos), 8,
@@ -212,14 +193,9 @@ anv_reloc_list_append(struct anv_reloc_list *list,
return result;
if (other->num_relocs > 0) {
memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
other->num_relocs * sizeof(other->relocs[0]));
memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
other->num_relocs * sizeof(other->reloc_bos[0]));
for (uint32_t i = 0; i < other->num_relocs; i++)
list->relocs[i + list->num_relocs].offset += offset;
list->num_relocs += other->num_relocs;
}
@@ -1081,13 +1057,6 @@ struct anv_execbuf {
uint32_t cmd_buffer_count;
struct anv_query_pool *perf_query_pool;
/* Indicates whether any of the command buffers have relocations. This
* doesn't necessarily mean we'll need the kernel to process them. It
* might be that a previous execbuf has already placed things in the VMA
* and we can make i915 skip the relocations.
*/
bool has_relocs;
const VkAllocationCallbacks * alloc;
VkSystemAllocationScope alloc_scope;
@@ -1198,27 +1167,12 @@ anv_execbuf_add_bo(struct anv_device *device,
}
if (relocs != NULL) {
assert(obj->relocation_count == 0);
if (relocs->num_relocs > 0) {
/* This is the first time we've ever seen a list of relocations for
* this BO. Go ahead and set the relocations and then walk the list
* of relocations and add them all.
*/
exec->has_relocs = true;
obj->relocation_count = relocs->num_relocs;
obj->relocs_ptr = (uintptr_t) relocs->relocs;
for (size_t i = 0; i < relocs->num_relocs; i++) {
VkResult result;
/* A quick sanity check on relocations */
assert(relocs->relocs[i].offset < bo->size);
result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
NULL, extra_flags);
if (result != VK_SUCCESS)
return result;
}
for (size_t i = 0; i < relocs->num_relocs; i++) {
VkResult result =
anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
NULL, extra_flags);
if (result != VK_SUCCESS)
return result;
}
return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
@@ -1253,141 +1207,6 @@ anv_execbuf_add_bo_bitset(struct anv_device *device,
return VK_SUCCESS;
}
static void
anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
struct anv_reloc_list *list)
{
for (size_t i = 0; i < list->num_relocs; i++) {
list->relocs[i].target_handle = list->reloc_bos[i]->exec_obj_index;
}
}
static void
anv_reloc_list_apply(struct anv_device *device,
struct anv_reloc_list *list,
struct anv_bo *bo,
bool always_relocate)
{
for (size_t i = 0; i < list->num_relocs; i++) {
struct anv_bo *target_bo = list->reloc_bos[i];
if (list->relocs[i].presumed_offset == target_bo->offset &&
!always_relocate)
continue;
void *p = bo->map + list->relocs[i].offset;
write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
list->relocs[i].presumed_offset = target_bo->offset;
}
}
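
For reference, the write_reloc() call above boils down to poking the final
GPU address into the mapped BO. A condensed sketch of that write, assuming
a Gfx8+ part with 64-bit relocations (the shipping helper also handles
32-bit writes on older hardware and cacheline flushing on non-LLC parts):

#include <stdint.h>

/* Condensed sketch of the address write performed by write_reloc();
 * cache flushing and pre-Gfx8 32-bit writes are elided.
 */
static void
write_reloc_sketch(void *p, uint64_t v)
{
   /* 48-bit PPGTT addresses must be in canonical form: bit 47 is
    * sign-extended through the upper 16 bits.
    */
   uint64_t canonical = (uint64_t)((int64_t)(v << 16) >> 16);
   *(uint64_t *)p = canonical;
}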
/**
* This function applies the relocations for a command buffer and writes the
* actual addresses into the buffers as per what we were told by the kernel on
* the previous execbuf2 call. This should be safe to do because, for each
* relocated address, we have two cases:
*
* 1) The target BO is inactive (as seen by the kernel). In this case, it is
* not in use by the GPU so updating the address is 100% ok. It won't be
* in-use by the GPU (from our context) again until the next execbuf2
* happens. If the kernel decides to move it in the next execbuf2, it
* will have to do the relocations itself, but that's ok because it should
* have all of the information needed to do so.
*
* 2) The target BO is active (as seen by the kernel). In this case, it
* hasn't moved since the last execbuffer2 call because GTT shuffling
* *only* happens when the BO is idle. (From our perspective, it only
* happens inside the execbuffer2 ioctl, but the shuffling may be
* triggered by another ioctl, with full-ppgtt this is limited to only
* execbuffer2 ioctls on the same context, or memory pressure.) Since the
* target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
* address and the relocated value we are writing into the BO will be the
* same as the value that is already there.
*
* There is also a possibility that the target BO is active but the exact
* RENDER_SURFACE_STATE object we are writing the relocation into isn't in
* use. In this case, the address currently in the RENDER_SURFACE_STATE
* may be stale but it's still safe to write the relocation because that
* particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
* won't be until the next execbuf2 call.
*
* By doing relocations on the CPU, we can tell the kernel that it doesn't
* need to bother. We want to do this because the surface state buffer is
* used by every command buffer so, if the kernel does the relocations, it
* will always be busy and the kernel will always stall. This is also
* probably the fastest mechanism for doing relocations since the kernel would
* have to make a full copy of all the relocations lists.
*/
static bool
execbuf_can_skip_relocations(struct anv_execbuf *exec)
{
if (!exec->has_relocs)
return true;
static int userspace_relocs = -1;
if (userspace_relocs < 0)
userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
if (!userspace_relocs)
return false;
/* First, we have to check to see whether or not we can even do the
* relocation. New buffers which have never been submitted to the kernel
* don't have a valid offset so we need to let the kernel do relocations so
* that we can get offsets for them. On future execbuf2 calls, those
* buffers will have offsets and we will be able to skip relocating.
* Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
*/
for (uint32_t i = 0; i < exec->bo_count; i++) {
if (exec->bos[i]->offset == (uint64_t)-1)
return false;
}
return true;
}
static void
relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_execbuf *exec)
{
/* Since surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't know
* what address is actually written in the surface state object at any
* given time. The only option is to always relocate them.
*/
struct anv_bo *surface_state_bo =
cmd_buffer->device->surface_state_pool.block_pool.bo;
anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
surface_state_bo,
true /* always relocate surface states */);
/* Since we own all of the batch buffers, we know what values are stored
* in the relocated addresses and only have to update them if the offsets
* have changed.
*/
struct anv_batch_bo **bbo;
u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
anv_reloc_list_apply(cmd_buffer->device,
&(*bbo)->relocs, (*bbo)->bo, false);
}
for (uint32_t i = 0; i < exec->bo_count; i++)
exec->objects[i].offset = exec->bos[i]->offset;
}
static void
reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
{
/* In the case where we fall back to doing kernel relocations, we need to
* ensure that the relocation list is valid. All relocations on the batch
* buffers are already valid and kept up-to-date. Since surface states are
* shared between command buffers and we don't know what order they will be
* submitted to the kernel, we don't know what address is actually written
* in the surface state object at any given time. The only option is to set
* a bogus presumed offset and let the kernel relocate them.
*/
for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
}
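
Setting presumed_offset to -1 works because, under the execbuffer2 UAPI
semantics, i915 only rewrites a relocated address when the presumed offset
disagrees with where the target object actually sits. A conceptual sketch
of that check (paraphrased semantics, not actual kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Paraphrase of the kernel-side decision for one relocation entry: a
 * presumed_offset of -1 never matches a real GTT offset, so the kernel
 * is forced to process the relocation.
 */
static bool
kernel_must_process_reloc(uint64_t presumed_offset, uint64_t actual_offset)
{
   return presumed_offset != actual_offset;
}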
static VkResult
anv_execbuf_add_syncobj(struct anv_device *device,
struct anv_execbuf *exec,
@@ -1487,9 +1306,6 @@ static VkResult
setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
struct anv_cmd_buffer *cmd_buffer)
{
struct anv_state_pool *ss_pool =
&cmd_buffer->device->surface_state_pool;
VkResult result;
/* Add surface dependencies (BOs) to the execbuf */
anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
@@ -1534,7 +1350,6 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
uint32_t num_cmd_buffers)
{
struct anv_device *device = queue->device;
struct anv_state_pool *ss_pool = &device->surface_state_pool;
VkResult result;
/* Edit the tail of the command buffers to chain them all together if they
@@ -1550,13 +1365,14 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
}
/* Add all the global BOs to the object list for softpin case. */
anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
struct anv_block_pool *pool;
pool = &device->surface_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
struct anv_block_pool *pool;
pool = &device->dynamic_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
@@ -1595,56 +1411,8 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
return result;
}
bool no_reloc = true;
if (execbuf->has_relocs) {
no_reloc = execbuf_can_skip_relocations(execbuf);
if (no_reloc) {
/* If we were able to successfully relocate everything, tell the
* kernel that it can skip doing relocations. The requirements for
* using NO_RELOC are:
*
* 1) The addresses written in the objects must match the
* corresponding reloc.presumed_offset which in turn must match
* the corresponding execobject.offset.
*
* 2) To avoid stalling, execobject.offset should match the current
* address of that object within the active context.
*
* In order to satisfy all of the invariants that make userspace
* relocations to be safe (see relocate_cmd_buffer()), we need to
* further ensure that the addresses we use match those used by the
* kernel for the most recent execbuf2.
*
* The kernel may still choose to do relocations anyway if something
* has moved in the GTT. In this case, the relocation list still
* needs to be valid. All relocations on the batch buffers are
* already valid and kept up-to-date. For surface state relocations,
* by applying the relocations in relocate_cmd_buffer, we ensured
* that the address in the RENDER_SURFACE_STATE matches
* presumed_offset, so it should be safe for the kernel to relocate
* them as needed.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
relocate_cmd_buffer(cmd_buffers[i], execbuf);
anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
device->surface_state_pool.block_pool.bo,
true /* always relocate surface states */);
}
} else {
/* In the case where we fall back to doing kernel relocations, we
* need to ensure that the relocation list is valid. All relocations
* on the batch buffers are already valid and kept up-to-date. Since
* surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't
* know what address is actually written in the surface state object
* at any given time. The only option is to set a bogus presumed
* offset and let the kernel relocate them.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++)
reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
}
}
for (uint32_t i = 0; i < execbuf->bo_count; i++)
execbuf->objects[i].offset = execbuf->bos[i]->offset;
struct anv_batch_bo *first_batch_bo =
list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
@@ -1670,25 +1438,9 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
first_batch_bo->bo->exec_obj_index = last_idx;
}
/* If we are pinning our BOs, we shouldn't have to relocate anything */
assert(!execbuf->has_relocs);
/* Now we go through and fixup all of the relocation lists to point to the
* correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
* do this after we reorder the list above as some of the indices may have
* changed.
*/
struct anv_batch_bo **bbo;
if (execbuf->has_relocs) {
assert(num_cmd_buffers == 1);
u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
}
if (device->physical->memory.need_clflush) {
__builtin_ia32_mfence();
struct anv_batch_bo **bbo;
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE)
@@ -1707,7 +1459,9 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
.num_cliprects = 0,
.DR1 = 0,
.DR4 = 0,
.flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
.flags = I915_EXEC_NO_RELOC |
I915_EXEC_HANDLE_LUT |
queue->exec_flags,
.rsvd1 = device->context_id,
.rsvd2 = 0,
};
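
A request built like the one above is ultimately handed to the kernel
through the execbuffer2 ioctl. A minimal sketch of that submission (anv's
real path also wires up fences and retries interrupted ioctls; the
drm-uapi include path follows Mesa's convention):

#include <errno.h>
#include <sys/ioctl.h>
#include "drm-uapi/i915_drm.h"

/* Minimal sketch: hand a fully populated execbuffer2 request to i915.
 * With I915_EXEC_NO_RELOC set, the kernel trusts every
 * execobject.offset as-is and skips relocation processing entirely.
 */
static int
submit_execbuf_sketch(int fd, struct drm_i915_gem_execbuffer2 *execbuf)
{
   if (ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf) != 0)
      return -errno;
   return 0;
}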
@@ -1778,8 +1532,10 @@ setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
.buffer_count = execbuf->bo_count,
.batch_start_offset = 0,
.batch_len = flush->batch.next - flush->batch.start,
.flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags |
(execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC),
.flags = I915_EXEC_NO_RELOC |
I915_EXEC_HANDLE_LUT |
I915_EXEC_FENCE_ARRAY |
queue->exec_flags,
.rsvd1 = device->context_id,
.rsvd2 = 0,
.num_cliprects = execbuf->syncobj_count,
@@ -1830,12 +1586,6 @@ anv_queue_exec_utrace_locked(struct anv_queue *queue,
* up in the wild due to a broken app. It's better to play it safe and
* just lock around QueueSubmit.
*
* 3) The anv_cmd_buffer_execbuf function may perform relocations in
* userspace. Due to the fact that the surface state buffer is shared
* between batches, we can't afford to have that happen from multiple
* threads at the same time. Even though the user is supposed to ensure
* this doesn't happen, we play it safe as in (2) above.
*
* Since the only other things that ever take the device lock, such as block
* pool resize, only rarely happen, this will almost never be contended, so
* taking a lock isn't really an expensive operation in this case.
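
The pattern described above amounts to one device-wide mutex serializing
every queue submission. An illustrative sketch with hypothetical stand-in
types and names (not anv's actual ones):

#include <pthread.h>

/* One device-wide mutex serializes all submissions; do_submit stands in
 * for the real execbuf path.
 */
struct device { pthread_mutex_t mutex; };

static int
queue_submit_locked(struct device *dev, int (*do_submit)(void *), void *job)
{
   pthread_mutex_lock(&dev->mutex);
   int result = do_submit(job);
   pthread_mutex_unlock(&dev->mutex);
   return result;
}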

src/intel/vulkan/anv_private.h

@@ -1349,7 +1349,6 @@ void anv_vma_free(struct anv_device *device,
struct anv_reloc_list {
uint32_t num_relocs;
uint32_t array_length;
struct drm_i915_gem_relocation_entry * relocs;
struct anv_bo ** reloc_bos;
uint32_t dep_words;
BITSET_WORD * deps;
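
After this change, the portion of the struct visible in this hunk is
reduced to BO dependency tracking; reloc_bos survives because it still
backs the dependency set. The resulting fields (closing brace added here
for readability; anything past the hunk is not shown):

struct anv_reloc_list {
   uint32_t num_relocs;
   uint32_t array_length;
   struct anv_bo **reloc_bos;
   uint32_t dep_words;
   BITSET_WORD *deps;
};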