anv: add support for indirect execution set

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31384>
This commit is contained in:
Lionel Landwerlin 2024-05-30 11:20:32 +03:00 committed by Marge Bot
parent c6dc2df9e5
commit 68885511d2
7 changed files with 485 additions and 0 deletions

View file

@ -51,6 +51,8 @@ genX_bits_included_symbols = [
'3DSTATE_STENCIL_BUFFER::Surface Pitch',
'3DSTATE_HIER_DEPTH_BUFFER::Surface Base Address',
'3DSTATE_HIER_DEPTH_BUFFER::Surface Pitch',
'3DSTATE_DS',
'3DSTATE_HS',
'3DSTATE_CLEAR_PARAMS',
'3DSTATE_SO_BUFFER::Surface Base Address',
'3DSTATE_SO_BUFFER::Stream Offset',

View file

@ -0,0 +1,336 @@
/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include <stdbool.h>
#include "genxml/genX_bits.h"
#include "anv_private.h"
/* Map a single Vulkan shader stage bit to the matching DGC stage enum.
 *
 * @vk_stage must contain exactly one of the supported stage bits; any
 * other value aborts via UNREACHABLE.
 */
enum anv_dgc_stage
anv_vk_stage_to_dgc_stage(VkShaderStageFlags vk_stage)
{
   if (vk_stage == VK_SHADER_STAGE_VERTEX_BIT)
      return ANV_DGC_STAGE_VERTEX;
   if (vk_stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)
      return ANV_DGC_STAGE_TESS_CTRL;
   if (vk_stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
      return ANV_DGC_STAGE_TESS_EVAL;
   if (vk_stage == VK_SHADER_STAGE_GEOMETRY_BIT)
      return ANV_DGC_STAGE_GEOMETRY;
   if (vk_stage == VK_SHADER_STAGE_FRAGMENT_BIT)
      return ANV_DGC_STAGE_FRAGMENT;
   if (vk_stage == VK_SHADER_STAGE_TASK_BIT_EXT)
      return ANV_DGC_STAGE_TASK;
   if (vk_stage == VK_SHADER_STAGE_MESH_BIT_EXT)
      return ANV_DGC_STAGE_MESH;
   if (vk_stage == VK_SHADER_STAGE_COMPUTE_BIT)
      return ANV_DGC_STAGE_COMPUTE;

   UNREACHABLE("Unhandled stage");
}
/* Convert a mask of Vulkan shader stage bits into the equivalent mask of
 * DGC stage bits (one BITFIELD_BIT per translated stage).
 */
uint32_t
anv_vk_stages_to_generated_stages(VkShaderStageFlags vk_stages)
{
   uint32_t out_mask = 0;

   anv_foreach_vk_stage(vk_stage, vk_stages) {
      const enum anv_dgc_stage dgc_stage = anv_vk_stage_to_dgc_stage(vk_stage);
      out_mask |= BITFIELD_BIT(dgc_stage);
   }

   return out_mask;
}
/* Fill a graphics DGC descriptor from the current graphics state.
 *
 * Two things are recorded:
 *   - workaround re-emissions of 3DSTATE_HS/3DSTATE_DS into
 *     descriptor->final_commands
 *   - the per-stage push constant layout (legacy push ranges or bindless
 *     inline dwords) into descriptor->push_constants
 *
 * The generation shader later consumes this descriptor to patch the
 * indirect command stream.
 */
void
anv_write_gfx_indirect_descriptor(struct anv_device *device,
                                  struct anv_dgc_gfx_descriptor *descriptor,
                                  struct anv_cmd_graphics_state *gfx)
{
   struct anv_dgc_push_stage_state empty_push = {};

   /* Wa_16011107343: append the packed 3DSTATE_HS to the trailing commands
    * when a tessellation control shader is bound.
    */
   if (intel_needs_workaround(device->info, 16011107343) &&
       gfx->shaders[MESA_SHADER_TESS_CTRL] != NULL) {
      memcpy(&descriptor->final_commands[descriptor->final_commands_size],
             gfx->dyn_state.packed.hs,
             _3DSTATE_HS_length(device->info) * 4);
      descriptor->final_commands_size += _3DSTATE_HS_length(device->info) * 4;
   }
   /* Wa_22018402687: same for 3DSTATE_DS when a tessellation evaluation
    * shader is bound.
    */
   if (intel_needs_workaround(device->info, 22018402687) &&
       gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL) {
      memcpy(&descriptor->final_commands[descriptor->final_commands_size],
             gfx->dyn_state.packed.ds,
             _3DSTATE_DS_length(device->info) * 4);
      descriptor->final_commands_size += _3DSTATE_DS_length(device->info) * 4;
   }
   assert(descriptor->final_commands_size <= sizeof(descriptor->final_commands));

   anv_foreach_vk_stage(vk_stage, ANV_GRAPHICS_STAGE_BITS) {
      enum anv_dgc_stage gen_stage = anv_vk_stage_to_dgc_stage(vk_stage);
      enum mesa_shader_stage stage = vk_to_mesa_shader_stage(vk_stage);

      /* Inactive stage: zero out its push state and move on. */
      if ((gfx->active_stages & vk_stage) == 0) {
         descriptor->push_constants.stages[gen_stage] = empty_push;
         continue;
      }

      const struct anv_pipeline_bind_map *bind_map =
         &gfx->shaders[stage]->bind_map;

      /* Stage has no push constants and no inline dwords: nothing to
       * describe for it either.
       */
      if ((bind_map->push_ranges[0].length == 0 ||
           bind_map->push_ranges[0].set != ANV_DESCRIPTOR_SET_PUSH_CONSTANTS) &&
          bind_map->inline_dwords_count == 0) {
         descriptor->push_constants.stages[gen_stage] = empty_push;
         continue;
      }

      /* Wa_18019110168: record where the mesh shader's remapping table
       * lives so the generation shader can reference it.
       */
      if (stage == MESA_SHADER_MESH &&
          intel_needs_workaround(device->info, 18019110168)) {
         const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
         descriptor->wa_18019110168_remapping_table_offset =
            gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
            mesh_prog_data->wa_18019110168_mapping_offset;
      }

      if (stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK) {
         /* Mesh/task stages use bindless inline dwords rather than legacy
          * push ranges.
          */
         descriptor->push_constants.stages[gen_stage].bindless.inline_dwords_count =
            bind_map->inline_dwords_count;
         assert(sizeof(bind_map->inline_dwords) ==
                sizeof(descriptor->push_constants.stages[gen_stage].bindless.inline_dwords));
         memcpy(descriptor->push_constants.stages[gen_stage].bindless.inline_dwords,
                bind_map->inline_dwords, sizeof(bind_map->inline_dwords));
      } else {
         /* Legacy path: one slot per non-empty push range. */
         for (uint32_t i = 0; i < ARRAY_SIZE(bind_map->push_ranges); i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               break;

            /* We should have compiled all the indirectly bindable shaders
             * in such a way that it's the only types of push constants we
             * should see.
             */
            assert(range->set == ANV_DESCRIPTOR_SET_PUSH_CONSTANTS ||
                   range->set == ANV_DESCRIPTOR_SET_DESCRIPTORS ||
                   range->set == ANV_DESCRIPTOR_SET_NULL ||
                   range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING);

            struct anv_dgc_push_stage_slot *slot =
               &descriptor->push_constants.stages[gen_stage].legacy.slots[i];
            /* Ranges are expressed in 32-byte units. */
            slot->push_data_size = 32 * range->length;
            slot->push_data_offset = 32 * range->start;
            slot->type = ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS;
            descriptor->push_constants.stages[gen_stage].legacy.n_slots++;
         }
      }

      descriptor->push_constants.active_stages |= 1u << gen_stage;
   }
}
/* Write the compute descriptor for @shader into slot @entry of the
 * execution set's BO and fold the shader's requirements into the
 * set-wide maximums (scratch size, ray queries, systolic usage).
 */
static void
write_cs_set_entry(struct anv_device *device,
                   struct anv_indirect_execution_set *indirect_set,
                   uint32_t entry, struct anv_shader *shader)
{
   struct anv_dgc_cs_descriptor descriptor;
   anv_genX(device->info, write_cs_descriptor)(&descriptor, device, shader);

   const struct brw_cs_prog_data *prog_data =
      brw_cs_prog_data_const(shader->prog_data);

   /* Before Gfx12.5, keep the shader's relocations on the set so the BOs
    * they reference stay resident (see the relocs field comment:
    * scratch buffers on < Gfx12.5).
    * NOTE(review): the VkResult of anv_reloc_list_append() is ignored
    * here — confirm allocation failure is acceptable to drop.
    */
   if (device->info->verx10 < 125)
      anv_reloc_list_append(&indirect_set->relocs, &shader->relocs);

   /* Descriptors are laid out back-to-back with a fixed stride. */
   memcpy(indirect_set->bo->map + entry * indirect_set->stride,
          &descriptor, sizeof(descriptor));

   indirect_set->uses_systolic |= prog_data->uses_systolic;
   indirect_set->max_scratch = MAX2(indirect_set->max_scratch,
                                    prog_data->base.total_scratch);
   indirect_set->max_ray_queries = MAX2(indirect_set->max_ray_queries,
                                        shader->prog_data->ray_queries);
}
/* Fold a ray-tracing pipeline's scratch and ray-query requirements into
 * the execution set's maximums. @entry is currently unused: no per-entry
 * GPU data is written for ray-tracing sets.
 */
static void
write_rt_set_entry(struct anv_indirect_execution_set *indirect_set,
                   uint32_t entry, struct vk_pipeline *pipeline)
{
   const uint32_t scratch_size = vk_pipeline_get_rt_scratch_size(pipeline);
   const uint32_t ray_queries = vk_pipeline_get_rt_ray_queries(pipeline);

   if (scratch_size > indirect_set->max_scratch)
      indirect_set->max_scratch = scratch_size;
   if (ray_queries > indirect_set->max_ray_queries)
      indirect_set->max_ray_queries = ray_queries;
}
/* vkCreateIndirectExecutionSetEXT
 *
 * Creates an execution set from either a pipeline or a shader-object
 * description. For compute sets, a BO of fixed-stride descriptors is
 * allocated and entry 0 is initialized from the initial shader; for
 * ray-tracing sets only the aggregated maximums are tracked.
 *
 * On failure, resources acquired so far are released via the goto
 * cleanup chain below.
 */
VkResult anv_CreateIndirectExecutionSetEXT(
    VkDevice                                    _device,
    const VkIndirectExecutionSetCreateInfoEXT*  pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkIndirectExecutionSetEXT*                  pIndirectExecutionSet)
{
   ANV_FROM_HANDLE(anv_device, device, _device);

   struct anv_indirect_execution_set *indirect_set =
      vk_object_zalloc(&device->vk, pAllocator,
                       sizeof(struct anv_indirect_execution_set),
                       VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT);
   if (indirect_set == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result =
      anv_reloc_list_init(&indirect_set->relocs,
                          pAllocator ? pAllocator : &device->vk.alloc,
                          device->physical->uses_relocs);
   if (result != VK_SUCCESS)
      goto fail_object;

   /* Figure out the bind point and the initial shader/pipeline from
    * whichever of the two create-info variants was provided.
    */
   struct vk_pipeline *vk_pipeline = NULL;
   struct vk_shader *vk_shader = NULL;
   VkPipelineBindPoint bind_point;
   uint32_t entry_count;
   if (pCreateInfo->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT) {
      entry_count = pCreateInfo->info.pPipelineInfo->maxPipelineCount;
      vk_pipeline =
         vk_pipeline_from_handle(pCreateInfo->info.pPipelineInfo->initialPipeline);
      bind_point = vk_pipeline->bind_point;
      if (vk_pipeline->bind_point == VK_PIPELINE_BIND_POINT_COMPUTE)
         vk_shader = vk_pipeline_get_shader(vk_pipeline, MESA_SHADER_COMPUTE);
   } else {
      /* Shader objects only support compute. */
      entry_count = pCreateInfo->info.pShaderInfo->maxShaderCount;
      vk_shader =
         vk_shader_from_handle(pCreateInfo->info.pShaderInfo->pInitialShaders[0]);
      bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
   }

   enum anv_bo_alloc_flags alloc_flags =
      ANV_BO_ALLOC_CAPTURE |
      ANV_BO_ALLOC_MAPPED |
      ANV_BO_ALLOC_HOST_CACHED_COHERENT;

   switch (bind_point) {
   case VK_PIPELINE_BIND_POINT_COMPUTE: {
      struct anv_shader *shader = container_of(vk_shader, struct anv_shader, vk);

      /* Alignment required for
       * MEDIA_INTERFACE_DESCRIPTOR_LOAD::InterfaceDescriptorDataStartAddress
       */
      STATIC_ASSERT(sizeof(struct anv_dgc_cs_descriptor) % 64 == 0);
      indirect_set->stride = sizeof(struct anv_dgc_cs_descriptor);
      uint32_t size = align(entry_count * indirect_set->stride, 4096);

      /* Generations up to Gfx12.0 have structures describing the compute
       * shader that need to live in the dynamic state heap.
       */
      if (device->info->verx10 <= 120)
         alloc_flags |= ANV_BO_ALLOC_DYNAMIC_VISIBLE_POOL;

      result = anv_device_alloc_bo(device, "indirect-exec-set", size,
                                   alloc_flags, 0 /* explicit_address */,
                                   &indirect_set->bo);
      if (result != VK_SUCCESS)
         goto fail_relocs;

      /* Keep a private copy of the bind map: the execution set can outlive
       * the shader it was created from.
       */
      indirect_set->bind_map = anv_pipeline_bind_map_clone(
         device, pAllocator, &shader->bind_map);
      if (indirect_set->bind_map == NULL) {
         result = vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
                            "Fail to allocate bind map");
         goto fail_bo;
      }

      write_cs_set_entry(device, indirect_set, 0, shader);
      break;
   }
   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
      VK_FROM_HANDLE(vk_pipeline, pipeline,
                     pCreateInfo->info.pPipelineInfo->initialPipeline);
      write_rt_set_entry(indirect_set, 0, pipeline);
      break;
   }
   default:
      UNREACHABLE("Unsupported indirect pipeline type");
   }

   *pIndirectExecutionSet = anv_indirect_execution_set_to_handle(indirect_set);

   return VK_SUCCESS;

 fail_bo:
   anv_device_release_bo(device, indirect_set->bo);
 fail_relocs:
   anv_reloc_list_finish(&indirect_set->relocs);
 fail_object:
   vk_object_free(&device->vk, pAllocator, indirect_set);

   return result;
}
/* vkDestroyIndirectExecutionSetEXT
 *
 * Releases everything owned by the set: the cloned bind map, the
 * relocation list, the descriptor BO (if one was allocated — ray-tracing
 * sets have none) and the object itself.
 */
void anv_DestroyIndirectExecutionSetEXT(
    VkDevice                                    _device,
    VkIndirectExecutionSetEXT                   indirectExecutionSet,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_indirect_execution_set, indirect_set, indirectExecutionSet);

   /* Per the Vulkan spec, destroying VK_NULL_HANDLE is a valid no-op;
    * without this check we would dereference a NULL indirect_set below.
    */
   if (indirect_set == NULL)
      return;

   vk_free2(&device->vk.alloc, pAllocator, indirect_set->bind_map);
   anv_reloc_list_finish(&indirect_set->relocs);
   if (indirect_set->bo)
      anv_device_release_bo(device, indirect_set->bo);
   vk_object_free(&device->vk, pAllocator, indirect_set);
}
/* vkUpdateIndirectExecutionSetPipelineEXT
 *
 * Rewrites entries of a pipeline-based execution set. Compute pipelines
 * update the descriptor at the written index; ray-tracing pipelines only
 * refresh the set-wide maximums.
 */
void anv_UpdateIndirectExecutionSetPipelineEXT(
    VkDevice                                    _device,
    VkIndirectExecutionSetEXT                   indirectExecutionSet,
    uint32_t                                    executionSetWriteCount,
    const VkWriteIndirectExecutionSetPipelineEXT* pExecutionSetWrites)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_indirect_execution_set, indirect_set, indirectExecutionSet);

   for (uint32_t i = 0; i < executionSetWriteCount; i++) {
      VK_FROM_HANDLE(vk_pipeline, pipeline, pExecutionSetWrites[i].pipeline);

      switch (pipeline->bind_point) {
      case VK_PIPELINE_BIND_POINT_COMPUTE: {
         struct vk_shader *vk_shader =
            vk_pipeline_get_shader(pipeline, MESA_SHADER_COMPUTE);
         struct anv_shader *shader = container_of(vk_shader, struct anv_shader, vk);
         write_cs_set_entry(device, indirect_set,
                            pExecutionSetWrites[i].index, shader);
         break;
      }
      case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
         write_rt_set_entry(indirect_set, pExecutionSetWrites[i].index, pipeline);
         break;
      default:
         UNREACHABLE("Unsupported indirect pipeline type");
      }
   }
}
/* vkUpdateIndirectExecutionSetShaderEXT
 *
 * Rewrites entries of a shader-object-based execution set. Only compute
 * shaders are expected here (asserted below).
 */
void anv_UpdateIndirectExecutionSetShaderEXT(
    VkDevice                                    _device,
    VkIndirectExecutionSetEXT                   indirectExecutionSet,
    uint32_t                                    executionSetWriteCount,
    const VkWriteIndirectExecutionSetShaderEXT* pExecutionSetWrites)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_indirect_execution_set, indirect_set, indirectExecutionSet);

   for (uint32_t w = 0; w < executionSetWriteCount; w++) {
      const VkWriteIndirectExecutionSetShaderEXT *write = &pExecutionSetWrites[w];
      VK_FROM_HANDLE(vk_shader, vk_shader, write->shader);

      assert(vk_shader->stage == MESA_SHADER_COMPUTE);

      struct anv_shader *shader = container_of(vk_shader, struct anv_shader, vk);
      write_cs_set_entry(device, indirect_set, write->index, shader);
   }
}

View file

@ -548,6 +548,10 @@ void genX(write_rt_shader_group)(struct anv_device *device,
uint32_t shader_count,
void *output);
void genX(write_cs_descriptor)(struct anv_dgc_cs_descriptor *desc,
struct anv_device *device,
struct anv_shader *shader);
uint32_t genX(shader_cmd_size)(struct anv_device *device,
mesa_shader_stage stage);

View file

@ -54,6 +54,7 @@
#include "compiler/brw/brw_rt.h"
#include "ds/intel_driver_ds.h"
#include "dev/virtio/intel_virtio.h"
#include "shaders/libintel_shaders.h"
#include "util/bitset.h"
#include "util/bitscan.h"
#include "util/cache_ops.h"
@ -1116,6 +1117,11 @@ struct anv_pipeline_bind_map {
uint8_t inferred_behavior;
};
struct anv_pipeline_bind_map *
anv_pipeline_bind_map_clone(struct anv_device *device,
const VkAllocationCallbacks *alloc,
const struct anv_pipeline_bind_map *src);
struct anv_push_descriptor_info {
/* A bitfield of descriptors used. */
uint32_t used_descriptors;
@ -5193,6 +5199,10 @@ struct anv_event {
#define ANV_STAGE_MASK ((1 << MESA_VULKAN_SHADER_STAGES) - 1)
#define ANV_VK_STAGE_MASK (ANV_GRAPHICS_STAGE_BITS | \
ANV_RT_STAGE_BITS | \
VK_SHADER_STAGE_COMPUTE_BIT)
#define anv_foreach_stage(stage, stage_bits) \
u_foreach_bit(stage, (stage_bits & ANV_STAGE_MASK))
@ -6539,6 +6549,36 @@ static inline uint32_t khr_perf_query_preamble_offset(const struct anv_query_poo
pool->khr_perf_preamble_stride * pass;
}
/* Backing object for VkIndirectExecutionSetEXT. */
struct anv_indirect_execution_set {
   struct vk_object_base base;

   /* Clone of the initial compute shader's bind map; owned by this set
    * and freed on destruction. NULL for ray-tracing sets.
    */
   struct anv_pipeline_bind_map *bind_map;

   /** List of all the scratch buffers on < Gfx12.5 */
   struct anv_reloc_list relocs;

   /* BO holding one descriptor per entry (compute sets only). */
   struct anv_bo *bo;

   /* True if any shader in the set uses systolic pipelines (DPAS). */
   bool uses_systolic;

   /* Byte stride between consecutive descriptors in @bo. */
   uint32_t stride;

   /* NOTE(review): not written in this chunk — presumably the largest
    * final_commands payload across entries; confirm against users.
    */
   uint32_t max_final_commands_size;

   /** Maximum scratch space for shaders */
   uint32_t max_scratch;

   /** Maximum number of ray queries used by shaders */
   uint32_t max_ray_queries;
};
void anv_write_gfx_indirect_descriptor(struct anv_device *device,
struct anv_dgc_gfx_descriptor *descriptor,
struct anv_cmd_graphics_state *gfx);
enum anv_dgc_stage anv_vk_stage_to_dgc_stage(VkShaderStageFlags vk_stage);
uint32_t anv_vk_stages_to_generated_stages(VkShaderStageFlags vk_stages);
struct anv_vid_mem {
struct anv_device_memory *mem;
VkDeviceSize offset;
@ -6949,6 +6989,9 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_video_session, vk.base,
VkVideoSessionKHR,
VK_OBJECT_TYPE_VIDEO_SESSION_KHR)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_indirect_execution_set, base,
VkIndirectExecutionSetEXT,
VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT)
#define anv_genX(devinfo, thing) ({ \
__typeof(&gfx9_##thing) genX_thing; \

View file

@ -574,3 +574,33 @@ anv_device_finish_rt_shaders(struct anv_device *device)
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
return;
}
/* Deep-copy a pipeline bind map.
 *
 * The clone and its three trailing arrays (surfaces, samplers, embedded
 * samplers) are carved out of a single vk_multialloc allocation, so a
 * single vk_free2() on the returned pointer releases everything.
 *
 * Returns NULL on allocation failure.
 */
struct anv_pipeline_bind_map *
anv_pipeline_bind_map_clone(struct anv_device *device,
                            const VkAllocationCallbacks *alloc,
                            const struct anv_pipeline_bind_map *src)
{
   VK_MULTIALLOC(ma);
   VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_bind_map, bind_map, 1);
   VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surfaces, src->surface_count);
   VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, samplers, src->sampler_count);
   VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_embedded_sampler_binding, embedded_samplers, src->embedded_sampler_count);

   if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, alloc,
                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
      return NULL;

   /* Copy the struct wholesale, then repoint the array members at the
    * freshly copied storage (the memcpy left them pointing into src).
    */
   memcpy(bind_map, src, sizeof(*src));
   memcpy(surfaces, src->surface_to_descriptor,
          sizeof(*surfaces) * src->surface_count);
   bind_map->surface_to_descriptor = surfaces;
   memcpy(samplers, src->sampler_to_descriptor,
          sizeof(*samplers) * src->sampler_count);
   bind_map->sampler_to_descriptor = samplers;
   memcpy(embedded_samplers, src->embedded_sampler_to_binding,
          sizeof(*embedded_samplers) * src->embedded_sampler_count);
   bind_map->embedded_sampler_to_binding = embedded_samplers;

   return bind_map;
}

View file

@ -1272,6 +1272,75 @@ emit_cs_shader(struct anv_batch *batch,
#endif
}
/* Build the per-generation compute descriptor for @shader.
 *
 * The descriptor captures everything the DGC generation shader needs to
 * dispatch this compute shader: dispatch parameters, push constant
 * offsets, and pre-packed hardware commands (COMPUTE_WALKER on Gfx12.5+,
 * MEDIA_VFE_STATE / interface descriptor / GPGPU_WALKER before that).
 */
void
genX(write_cs_descriptor)(struct anv_dgc_cs_descriptor *desc,
                          struct anv_device *device,
                          struct anv_shader *shader)
{
   const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
   const struct anv_push_range *push_range = &bind_map->push_ranges[0];

   *desc = (struct anv_dgc_cs_descriptor) {
      /* Push ranges are in 32-byte units; only a push-constant range
       * contributes an offset.
       */
      .push_data_offset = 32 * (push_range->set == ANV_DESCRIPTOR_SET_PUSH_CONSTANTS ?
                                push_range->start : 0),
   };

   const struct brw_cs_prog_data *prog_data =
      brw_cs_prog_data_const(shader->prog_data);
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, prog_data, NULL);

   desc->right_mask = dispatch.right_mask;
   desc->threads = dispatch.threads;
   desc->simd_size = dispatch.simd_size;

#if GFX_VERx10 >= 125
   /* Pack a COMPUTE_WALKER template (header + MOCS), then OR in the
    * shader's precomputed walker body starting at dword 1.
    */
   GENX(COMPUTE_WALKER_pack)(NULL, desc->gfx125.compute_walker,
                             &(struct GENX(COMPUTE_WALKER)) {
                                GENX(COMPUTE_WALKER_header),
                                .body = {
                                   .PostSync.MOCS = anv_mocs(device, NULL, 0),
                                },
                             });
   assert(sizeof(desc->gfx125.compute_walker) >
          sizeof(shader->cs.gfx125.compute_walker_body));
   for (uint32_t i = 0; i < ARRAY_SIZE(shader->cs.gfx125.compute_walker_body); i++)
      desc->gfx125.compute_walker[1 + i] |= shader->cs.gfx125.compute_walker_body[i];

   desc->gfx125.inline_dwords_count = bind_map->inline_dwords_count;
   assert(sizeof(desc->gfx125.inline_dwords) ==
          sizeof(bind_map->inline_dwords));
   memcpy(desc->gfx125.inline_dwords,
          bind_map->inline_dwords,
          sizeof(bind_map->inline_dwords));
#else
   /* Pre-Gfx12.5: copy the packed MEDIA_VFE_STATE and interface
    * descriptor data the shader already carries.
    */
   assert(sizeof(desc->gfx9.media_vfe_state) ==
          shader->cs.gfx9.vfe.len * 4);
   assert(sizeof(desc->gfx9.interface_descriptor_data) ==
          sizeof(shader->cs.gfx9.idd));
   memcpy(desc->gfx9.media_vfe_state,
          &shader->cmd_data[shader->cs.gfx9.vfe.offset],
          shader->cs.gfx9.vfe.len * 4);
   memcpy(desc->gfx9.interface_descriptor_data,
          shader->cs.gfx9.idd,
          sizeof(desc->gfx9.interface_descriptor_data));

   desc->gfx9.n_threads = dispatch.threads;
   desc->gfx9.cross_thread_push_size = prog_data->push.cross_thread.size;
   desc->gfx9.per_thread_push_size = prog_data->push.per_thread.size;
   /* Offset of the subgroup id within the per-thread push data, relative
    * to the start of the push range plus the cross-thread section.
    */
   desc->gfx9.subgroup_id_offset =
      offsetof(struct anv_push_constants, cs.subgroup_id) -
      (32 * push_range->start + prog_data->push.cross_thread.size);

   /* Pack a bare GPGPU_WALKER header; dispatch fields are filled at
    * generation time.
    */
   GENX(GPGPU_WALKER_pack)(NULL, desc->gfx9.gpgpu_walker,
                           &(struct GENX(GPGPU_WALKER)) {
                              GENX(GPGPU_WALKER_header),
                           });
#endif
}
void
genX(init_instructions)(struct anv_physical_device *device)
{

View file

@ -160,6 +160,7 @@ libanv_files = files(
'anv_cmd_buffer.c',
'anv_descriptor_set.c',
'anv_device.c',
'anv_dgc_set.c',
'anv_embedded_sampler.c',
'anv_event.c',
'anv_formats.c',