anv: Switch shaders to dedicated VMA allocator
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Switch to the new VMA allocator, which provides explicit GPU VA
control via util_vma_heap.

This is architectural preparation for ray tracing capture/replay,
which requires the ability to reserve and allocate shaders at specific
VAs. The state pool's free-list design makes VA reservation difficult
to add, while the new chunk allocator is designed for explicit VA
management from the ground up.

Signed-off-by: Michael Cheng <michael.cheng@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38869>
This commit is contained in:
Michael Cheng 2025-08-06 14:22:13 -07:00 committed by Marge Bot
parent 1fa327ac32
commit 8ba197c9ef
5 changed files with 88 additions and 34 deletions

View file

@ -112,6 +112,32 @@ get_bo_from_pool(struct intel_batch_decode_bo *ret,
return false;
}
/* Shader heap: find the backing BO for a GPU VA.
 *
 * Walks every BO currently allocated in the device's shader heap and
 * returns (via *ret) the decode-BO descriptor for the one whose VA range
 * contains `address`.  Returns false when no heap BO covers the address.
 */
static bool
get_bo_from_shader_heap(struct intel_batch_decode_bo *ret,
                        const struct anv_device *device,
                        uint64_t address)
{
   unsigned idx;
   BITSET_FOREACH_SET(idx, device->shader_heap.allocated_bos, ANV_SHADER_HEAP_MAX_BOS) {
      struct anv_bo *bo = device->shader_heap.bos[idx].bo;
      /* Match the 48b-addressing convention used elsewhere */
      const uint64_t start = intel_48b_address(bo->offset);
      const uint64_t end = start + bo->size;

      if (address < start || address >= end)
         continue;

      ret->addr = start;
      ret->size = bo->size;
      ret->map = bo->map;
      return true;
   }
   return false;
}
/* Finding a buffer for batch decoding */
static struct intel_batch_decode_bo
decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
@ -123,7 +149,7 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address))
if (get_bo_from_shader_heap(&ret_bo, device, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
@ -551,13 +577,9 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_dynamic_state_pool;
result = anv_state_pool_init(&device->instruction_state_pool, device,
&(struct anv_state_pool_params) {
.name = "instruction pool",
.base_address = device->physical->va.instruction_state_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.instruction_state_pool.size,
});
result = anv_shader_heap_init(&device->shader_heap, device,
device->physical->va.instruction_state_pool,
21 /* 2MiB */, 27 /* 64MiB */);
if (result != VK_SUCCESS)
goto fail_custom_border_color_pool;
@ -573,7 +595,7 @@ VkResult anv_CreateDevice(
.max_size = device->physical->va.scratch_surface_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_instruction_state_pool;
goto fail_shader_vma_heap;
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
&(struct anv_state_pool_params) {
@ -1094,8 +1116,8 @@ VkResult anv_CreateDevice(
fail_scratch_surface_state_pool:
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
fail_shader_vma_heap:
anv_shader_heap_finish(&device->shader_heap);
fail_custom_border_color_pool:
anv_state_reserved_array_pool_finish(&device->custom_border_colors);
fail_dynamic_state_pool:
@ -1251,7 +1273,8 @@ void anv_DestroyDevice(
anv_state_pool_finish(&device->internal_surface_state_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
anv_shader_heap_finish(&device->shader_heap);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);

View file

@ -59,8 +59,7 @@ anv_shader_internal_destroy(struct vk_device *_device,
for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
}
@ -96,6 +95,7 @@ anv_shader_internal_create(struct anv_device *device,
VK_MULTIALLOC_DECL(&ma, struct intel_shader_reloc, prog_data_relocs,
prog_data_in->num_relocs);
VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params);
VK_MULTIALLOC_DECL(&ma, void, code, kernel_size);
VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info,
xfb_info_in == NULL ? 0 :
@ -121,17 +121,27 @@ anv_shader_internal_create(struct anv_device *device,
shader->stage = stage;
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64);
ANV_DMR_SP_ALLOC(&device->vk.base, &device->instruction_state_pool, shader->kernel);
memcpy(shader->kernel.map, kernel_data, kernel_size);
shader->code = code;
memcpy(shader->code, kernel_data, kernel_size);
shader->kernel = anv_shader_heap_alloc(&device->shader_heap,
kernel_size, 64, false, 0);
if (shader->kernel.alloc_size == 0) {
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
return NULL;
}
anv_shader_heap_upload(&device->shader_heap, shader->kernel,
kernel_data, kernel_size);
shader->kernel_size = kernel_size;
if (bind_map->embedded_sampler_count > 0) {
shader->embedded_samplers = embedded_samplers;
if (anv_device_get_embedded_samplers(device, embedded_samplers, bind_map) != VK_SUCCESS) {
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
return NULL;
}
@ -192,7 +202,7 @@ anv_shader_internal_serialize(struct vk_pipeline_cache_object *object,
blob_write_uint32(blob, shader->stage);
blob_write_uint32(blob, shader->kernel_size);
blob_write_bytes(blob, shader->kernel.map, shader->kernel_size);
blob_write_bytes(blob, shader->code, shader->kernel_size);
blob_write_uint32(blob, shader->prog_data_size);

View file

@ -1285,7 +1285,7 @@ struct anv_shader {
void *code;
struct anv_state kernel;
struct anv_shader_alloc kernel;
const struct brw_stage_prog_data *prog_data;
@ -2540,7 +2540,6 @@ struct anv_device {
struct anv_state_pool general_state_pool;
struct anv_state_pool aux_tt_pool;
struct anv_state_pool dynamic_state_pool;
struct anv_state_pool instruction_state_pool;
struct anv_state_pool binding_table_pool;
struct anv_state_pool scratch_surface_state_pool;
struct anv_state_pool internal_surface_state_pool;
@ -5220,7 +5219,9 @@ struct anv_shader_internal {
mesa_shader_stage stage;
struct anv_state kernel;
void *code;
struct anv_shader_alloc kernel;
uint32_t kernel_size;
const struct brw_stage_prog_data *prog_data;

View file

@ -23,7 +23,7 @@ anv_shader_destroy(struct vk_device *vk_device,
for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
anv_reloc_list_finish(&shader->relocs);
vk_shader_free(vk_device, pAllocator, vk_shader);
}
@ -629,10 +629,9 @@ anv_shader_create(struct anv_device *device,
memcpy(shader->code, shader_data->code,
shader_data->prog_data.base.program_size);
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool,
shader_data->prog_data.base.program_size, 64);
ANV_DMR_SP_ALLOC(&device->vk.base, &device->instruction_state_pool, shader->kernel);
shader->kernel = anv_shader_heap_alloc(&device->shader_heap,
shader_data->prog_data.base.program_size,
64, false, 0);
if (shader->kernel.alloc_size == 0) {
result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto error_embedded_samplers;
@ -688,7 +687,8 @@ anv_shader_create(struct anv_device *device,
if (result != VK_SUCCESS)
goto error_state;
memcpy(shader->kernel.map, shader_data->code,
anv_shader_heap_upload(&device->shader_heap,
shader->kernel, shader_data->code,
shader_data->prog_data.base.program_size);
if (mesa_shader_stage_is_rt(shader->vk.stage)) {
@ -717,8 +717,7 @@ anv_shader_create(struct anv_device *device,
return VK_SUCCESS;
error_state:
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
error_embedded_samplers:
for (uint32_t s = 0; s < shader->bind_map.embedded_sampler_count; s++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[s]);

View file

@ -352,6 +352,27 @@ out:
return result;
}
/* Add every BO backing the shader heap to the execbuf so the kernel pins
 * them for this submission.
 *
 * Takes heap->mutex to keep the allocated_bos bitset stable while we walk
 * it; stops at the first anv_execbuf_add_bo() failure and returns that
 * result (VK_SUCCESS when all BOs were added).
 */
static VkResult
pin_shader_heap(struct anv_device *device,
struct anv_execbuf *execbuf,
struct anv_shader_heap *heap)
{
VkResult result = VK_SUCCESS;
simple_mtx_lock(&heap->mutex);
unsigned i;
/* Visit only slots with a live allocation, per the heap's bitset. */
BITSET_FOREACH_SET(i, heap->allocated_bos, ANV_SHADER_HEAP_MAX_BOS) {
result = anv_execbuf_add_bo(device, execbuf, heap->bos[i].bo, NULL, 0);
if (result != VK_SUCCESS)
goto out; /* bail out, but still release the mutex below */
}
out:
simple_mtx_unlock(&heap->mutex);
return result;
}
static uint32_t
calc_batch_start_offset(struct anv_bo *bo)
{
@ -414,7 +435,7 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
if (result != VK_SUCCESS)
return result;
result = pin_state_pool(device, execbuf, &device->instruction_state_pool);
result = pin_shader_heap(device, execbuf, &device->shader_heap);
if (result != VK_SUCCESS)
return result;