anv: Switch shaders to dedicated VMA allocator
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Switch to the new VMA allocator, which provides explicit GPU VA
control via util_vma_heap.

This is architectural preparation for ray tracing capture/replay,
which requires the ability to reserve and allocate shaders at specific
VAs. The state pool's free-list design makes VA reservation difficult
to add, while the new chunk allocator is designed for explicit VA
management from the ground up.

Signed-off-by: Michael Cheng <michael.cheng@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38869>
This commit is contained in:
Michael Cheng 2025-08-06 14:22:13 -07:00 committed by Marge Bot
parent 1fa327ac32
commit 8ba197c9ef
5 changed files with 88 additions and 34 deletions

View file

@ -112,6 +112,32 @@ get_bo_from_pool(struct intel_batch_decode_bo *ret,
return false;
}
/* Shader heap: find the backing BO for a GPU VA.
 *
 * Walks every BO currently allocated in the device's shader heap and
 * returns (via *ret) the decode-BO descriptor for the one whose VA range
 * contains `address`.  Returns false when no heap BO covers the address.
 */
static bool
get_bo_from_shader_heap(struct intel_batch_decode_bo *ret,
                        const struct anv_device *device,
                        uint64_t address)
{
   unsigned idx;
   BITSET_FOREACH_SET(idx, device->shader_heap.allocated_bos, ANV_SHADER_HEAP_MAX_BOS) {
      struct anv_bo *bo = device->shader_heap.bos[idx].bo;
      /* Match the 48b-addressing convention used elsewhere */
      const uint64_t start = intel_48b_address(bo->offset);
      const uint64_t end = start + bo->size;

      if (address < start || address >= end)
         continue;

      ret->addr = start;
      ret->size = bo->size;
      ret->map = bo->map;
      return true;
   }
   return false;
}
/* Finding a buffer for batch decoding */
static struct intel_batch_decode_bo
decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
@ -123,7 +149,7 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address))
if (get_bo_from_shader_heap(&ret_bo, device, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
@ -551,13 +577,9 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_dynamic_state_pool;
result = anv_state_pool_init(&device->instruction_state_pool, device,
&(struct anv_state_pool_params) {
.name = "instruction pool",
.base_address = device->physical->va.instruction_state_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.instruction_state_pool.size,
});
result = anv_shader_heap_init(&device->shader_heap, device,
device->physical->va.instruction_state_pool,
21 /* 2MiB */, 27 /* 64MiB */);
if (result != VK_SUCCESS)
goto fail_custom_border_color_pool;
@ -573,7 +595,7 @@ VkResult anv_CreateDevice(
.max_size = device->physical->va.scratch_surface_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_instruction_state_pool;
goto fail_shader_vma_heap;
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
&(struct anv_state_pool_params) {
@ -1094,8 +1116,8 @@ VkResult anv_CreateDevice(
fail_scratch_surface_state_pool:
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
fail_shader_vma_heap:
anv_shader_heap_finish(&device->shader_heap);
fail_custom_border_color_pool:
anv_state_reserved_array_pool_finish(&device->custom_border_colors);
fail_dynamic_state_pool:
@ -1251,7 +1273,8 @@ void anv_DestroyDevice(
anv_state_pool_finish(&device->internal_surface_state_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
anv_shader_heap_finish(&device->shader_heap);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);

View file

@ -59,8 +59,7 @@ anv_shader_internal_destroy(struct vk_device *_device,
for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
}
@ -96,6 +95,7 @@ anv_shader_internal_create(struct anv_device *device,
VK_MULTIALLOC_DECL(&ma, struct intel_shader_reloc, prog_data_relocs,
prog_data_in->num_relocs);
VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params);
VK_MULTIALLOC_DECL(&ma, void, code, kernel_size);
VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info,
xfb_info_in == NULL ? 0 :
@ -121,17 +121,27 @@ anv_shader_internal_create(struct anv_device *device,
shader->stage = stage;
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64);
ANV_DMR_SP_ALLOC(&device->vk.base, &device->instruction_state_pool, shader->kernel);
memcpy(shader->kernel.map, kernel_data, kernel_size);
shader->code = code;
memcpy(shader->code, kernel_data, kernel_size);
shader->kernel = anv_shader_heap_alloc(&device->shader_heap,
kernel_size, 64, false, 0);
if (shader->kernel.alloc_size == 0) {
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
return NULL;
}
anv_shader_heap_upload(&device->shader_heap, shader->kernel,
kernel_data, kernel_size);
shader->kernel_size = kernel_size;
if (bind_map->embedded_sampler_count > 0) {
shader->embedded_samplers = embedded_samplers;
if (anv_device_get_embedded_samplers(device, embedded_samplers, bind_map) != VK_SUCCESS) {
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
vk_pipeline_cache_object_finish(&shader->base);
vk_free(&device->vk.alloc, shader);
return NULL;
}
@ -192,7 +202,7 @@ anv_shader_internal_serialize(struct vk_pipeline_cache_object *object,
blob_write_uint32(blob, shader->stage);
blob_write_uint32(blob, shader->kernel_size);
blob_write_bytes(blob, shader->kernel.map, shader->kernel_size);
blob_write_bytes(blob, shader->code, shader->kernel_size);
blob_write_uint32(blob, shader->prog_data_size);

View file

@ -1285,7 +1285,7 @@ struct anv_shader {
void *code;
struct anv_state kernel;
struct anv_shader_alloc kernel;
const struct brw_stage_prog_data *prog_data;
@ -2540,7 +2540,6 @@ struct anv_device {
struct anv_state_pool general_state_pool;
struct anv_state_pool aux_tt_pool;
struct anv_state_pool dynamic_state_pool;
struct anv_state_pool instruction_state_pool;
struct anv_state_pool binding_table_pool;
struct anv_state_pool scratch_surface_state_pool;
struct anv_state_pool internal_surface_state_pool;
@ -5220,7 +5219,9 @@ struct anv_shader_internal {
mesa_shader_stage stage;
struct anv_state kernel;
void *code;
struct anv_shader_alloc kernel;
uint32_t kernel_size;
const struct brw_stage_prog_data *prog_data;

View file

@ -23,7 +23,7 @@ anv_shader_destroy(struct vk_device *vk_device,
for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
anv_reloc_list_finish(&shader->relocs);
vk_shader_free(vk_device, pAllocator, vk_shader);
}
@ -629,10 +629,9 @@ anv_shader_create(struct anv_device *device,
memcpy(shader->code, shader_data->code,
shader_data->prog_data.base.program_size);
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool,
shader_data->prog_data.base.program_size, 64);
ANV_DMR_SP_ALLOC(&device->vk.base, &device->instruction_state_pool, shader->kernel);
shader->kernel = anv_shader_heap_alloc(&device->shader_heap,
shader_data->prog_data.base.program_size,
64, false, 0);
if (shader->kernel.alloc_size == 0) {
result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto error_embedded_samplers;
@ -688,7 +687,8 @@ anv_shader_create(struct anv_device *device,
if (result != VK_SUCCESS)
goto error_state;
memcpy(shader->kernel.map, shader_data->code,
anv_shader_heap_upload(&device->shader_heap,
shader->kernel, shader_data->code,
shader_data->prog_data.base.program_size);
if (mesa_shader_stage_is_rt(shader->vk.stage)) {
@ -717,8 +717,7 @@ anv_shader_create(struct anv_device *device,
return VK_SUCCESS;
error_state:
ANV_DMR_SP_FREE(&device->vk.base, &device->instruction_state_pool, shader->kernel);
anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
anv_shader_heap_free(&device->shader_heap, shader->kernel);
error_embedded_samplers:
for (uint32_t s = 0; s < shader->bind_map.embedded_sampler_count; s++)
anv_embedded_sampler_unref(device, shader->embedded_samplers[s]);

View file

@ -352,6 +352,27 @@ out:
return result;
}
/* Add every BO backing the shader heap to the execbuf so the kernel pins
 * them for this submission.
 *
 * Takes heap->mutex to keep the allocated_bos bitset stable while we walk
 * it; stops at the first anv_execbuf_add_bo() failure and returns that
 * result (VK_SUCCESS when all BOs were added).
 */
static VkResult
pin_shader_heap(struct anv_device *device,
struct anv_execbuf *execbuf,
struct anv_shader_heap *heap)
{
VkResult result = VK_SUCCESS;
simple_mtx_lock(&heap->mutex);
unsigned i;
/* Visit only slots with a live allocation, per the heap's bitset. */
BITSET_FOREACH_SET(i, heap->allocated_bos, ANV_SHADER_HEAP_MAX_BOS) {
result = anv_execbuf_add_bo(device, execbuf, heap->bos[i].bo, NULL, 0);
if (result != VK_SUCCESS)
goto out; /* bail out, but still release the mutex below */
}
out:
simple_mtx_unlock(&heap->mutex);
return result;
}
static uint32_t
calc_batch_start_offset(struct anv_bo *bo)
{
@ -414,7 +435,7 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
if (result != VK_SUCCESS)
return result;
result = pin_state_pool(device, execbuf, &device->instruction_state_pool);
result = pin_shader_heap(device, execbuf, &device->shader_heap);
if (result != VK_SUCCESS)
return result;