lavapipe: Implement exec graph pipelines

Just a collection of compute shaders that can enqueue each other.

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24512>
Konstantin Seurer 2023-08-01 14:39:03 +02:00 committed by Marge Bot
parent b817b597c7
commit ff6a133b72
4 changed files with 389 additions and 3 deletions
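For context, the application-facing API being implemented is VK_AMDX_shader_enqueue (a beta extension, so VK_ENABLE_BETA_EXTENSIONS must be defined). Below is a minimal sketch of the create/query path, assuming the caller already has a device, pipeline layout and a filled-in compute shader stage; the node name "node_main" is a placeholder.

#define VK_ENABLE_BETA_EXTENSIONS
#include <vulkan/vulkan.h>

/* Sketch only: `device`, `layout` and the compute-stage
 * VkPipelineShaderStageCreateInfo are assumptions supplied by the caller;
 * "node_main" is a made-up node name. */
static VkPipeline
create_graph(VkDevice device, VkPipelineLayout layout,
             VkPipelineShaderStageCreateInfo stage_info)
{
   VkPipelineShaderStageNodeCreateInfoAMDX node_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX,
      .pName = "node_main",
      .index = 0,
   };
   stage_info.pNext = &node_info;

   VkExecutionGraphPipelineCreateInfoAMDX graph_info = {
      .sType = VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX,
      .stageCount = 1,
      .pStages = &stage_info,
      .layout = layout,
   };
   VkPipeline graph = VK_NULL_HANDLE;
   if (vkCreateExecutionGraphPipelinesAMDX(device, VK_NULL_HANDLE, 1,
                                           &graph_info, NULL, &graph) != VK_SUCCESS)
      return VK_NULL_HANDLE;

   /* The returned size bounds the scratch buffer needed at dispatch time. */
   VkExecutionGraphPipelineScratchSizeAMDX scratch = {
      .sType = VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_SCRATCH_SIZE_AMDX,
   };
   vkGetExecutionGraphPipelineScratchSizeAMDX(device, graph, &scratch);

   return graph;
}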


@@ -0,0 +1,181 @@
/*
* Copyright © 2023 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "lvp_private.h"
#include "nir_builder.h"
#define lvp_load_internal_field(b, bit_size, field) \
nir_load_ssbo(b, 1, bit_size, nir_imm_int(b, 0), \
nir_imm_int(b, offsetof(struct lvp_exec_graph_internal_data, field)))
#define lvp_store_internal_field(b, value, field, scope) \
nir_store_ssbo(b, value, nir_imm_int(b, 0), \
nir_iadd_imm(b, \
nir_imul_imm(b, nir_load_local_invocation_index(b), \
scope == SCOPE_INVOCATION \
? sizeof(struct lvp_exec_graph_shader_output) \
: 0), \
offsetof(struct lvp_exec_graph_internal_data, outputs) + \
offsetof(struct lvp_exec_graph_shader_output, field)))
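/* Rewrite node payload derefs into global-memory derefs: incoming payloads go
 * through the payload_in pointer from the internal data, outgoing payloads
 * through the 64-bit address held by the lowered payload variable. */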
static bool
lvp_lower_node_payload_deref(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_deref)
return false;
nir_deref_instr *deref = nir_instr_as_deref(instr);
bool is_payload = nir_deref_mode_is(deref, nir_var_mem_node_payload);
bool is_payload_in = nir_deref_mode_is(deref, nir_var_mem_node_payload_in);
if (!is_payload && !is_payload_in)
return false;
deref->modes = nir_var_mem_global;
if (deref->deref_type != nir_deref_type_var)
return true;
if (is_payload_in) {
b->cursor = nir_after_instr(instr);
nir_def *payload = lvp_load_internal_field(b, 64, payload_in);
nir_deref_instr *cast = nir_build_deref_cast(b, payload, nir_var_mem_global, deref->type, 0);
nir_def_rewrite_uses(&deref->def, &cast->def);
} else {
nir_foreach_use_safe(use, &deref->def) {
b->cursor = nir_before_instr(use->parent_instr);
nir_def *payload = nir_load_var(b, deref->var);
nir_deref_instr *cast =
nir_build_deref_cast(b, payload, nir_var_mem_global, deref->type, 0);
nir_src_rewrite(use, &cast->def);
}
}
nir_instr_remove(instr);
return true;
}
static bool
lvp_lower_node_payload_derefs(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lvp_lower_node_payload_deref,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
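/* initialize_node_payloads picks the per-invocation (or per-workgroup) slice
 * of the payloads buffer, stores its address in the payload variable and
 * records the payload count and target node index in the internal outputs. */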
static void
lvp_build_initialize_node_payloads(nir_builder *b, nir_intrinsic_instr *intr)
{
mesa_scope scope = nir_intrinsic_execution_scope(intr);
assert(scope == SCOPE_INVOCATION || scope == SCOPE_WORKGROUP);
nir_deref_instr *payloads_deref = nir_src_as_deref(intr->src[0]);
assert(payloads_deref->deref_type == nir_deref_type_var);
nir_variable *payloads_var = payloads_deref->var;
nir_def *addr = lvp_load_internal_field(b, 64, payloads);
if (scope == SCOPE_INVOCATION) {
nir_def *payloads_offset =
nir_imul_imm(b, nir_load_local_invocation_index(b), b->shader->info.cs.node_payloads_size);
addr = nir_iadd(b, addr, nir_u2u64(b, payloads_offset));
}
nir_store_var(b, payloads_var, addr, 0x1);
nir_def *payload_count = intr->src[1].ssa;
lvp_store_internal_field(b, payload_count, payload_count, scope);
nir_def *node_index = intr->src[2].ssa;
lvp_store_internal_field(b, node_index, node_index, scope);
}
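/* Lower the remaining exec graph intrinsics: enqueue_node_payloads becomes a
 * no-op (the payloads already live in memory), finalizing an incoming payload
 * always succeeds, and inputs are never coalesced (count of 1). */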
static bool
lvp_lower_node_payload_intrinsic(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_enqueue_node_payloads) {
nir_instr_remove(instr);
return false;
}
b->cursor = nir_after_instr(instr);
switch (intr->intrinsic) {
case nir_intrinsic_initialize_node_payloads:
lvp_build_initialize_node_payloads(b, intr);
nir_instr_remove(instr);
return true;
case nir_intrinsic_finalize_incoming_node_payload:
nir_def_rewrite_uses(&intr->def, nir_imm_true(b));
nir_instr_remove(instr);
return true;
case nir_intrinsic_load_coalesced_input_count:
nir_def_rewrite_uses(&intr->def, nir_imm_int(b, 1));
nir_instr_remove(instr);
return true;
default:
return false;
}
}
static bool
lvp_lower_exec_graph_intrinsics(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lvp_lower_node_payload_intrinsic,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
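/* Demote node payload variables to 64-bit addresses held in shader_temp
 * storage and remember the name of the node the outgoing payloads target. */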
static void
lvp_lower_node_payload_vars(struct lvp_pipeline *pipeline, nir_shader *nir)
{
nir_foreach_variable_in_shader(var, nir) {
if (var->data.mode != nir_var_mem_node_payload &&
var->data.mode != nir_var_mem_node_payload_in)
continue;
if (var->data.mode == nir_var_mem_node_payload) {
assert(var->data.node_name);
assert(!pipeline->exec_graph.next_name);
pipeline->exec_graph.next_name = var->data.node_name;
}
var->data.mode = nir_var_shader_temp;
var->type = glsl_uint64_t_type();
}
}
bool
lvp_lower_exec_graph(struct lvp_pipeline *pipeline, nir_shader *nir)
{
bool progress = false;
NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
nir_var_mem_node_payload | nir_var_mem_node_payload_in,
glsl_get_natural_size_align_bytes);
if (!progress)
return false;
/* Lower node payload variables to 64-bit addresses. */
lvp_lower_node_payload_vars(pipeline, nir);
/* Lower exec graph intrinsics to their actual implementation. */
lvp_lower_exec_graph_intrinsics(nir);
/* Lower node payloads to load/store_global instructions. */
lvp_lower_node_payload_derefs(nir);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, nir_address_format_64bit_global);
/* Cleanup passes */
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_opt_dce);
return true;
}


@@ -279,6 +279,11 @@ compile_spirv(struct lvp_device *pdevice, const VkPipelineShaderStageCreateInfo
assert(stage <= LVP_SHADER_STAGES && stage != MESA_SHADER_NONE);
VkResult result;
#ifdef VK_ENABLE_BETA_EXTENSIONS
const VkPipelineShaderStageNodeCreateInfoAMDX *node_info = vk_find_struct_const(
sinfo->pNext, PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX);
#endif
const struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_VULKAN,
.caps = {
@@ -333,6 +338,9 @@ compile_spirv(struct lvp_device *pdevice, const VkPipelineShaderStageCreateInfo
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.push_const_addr_format = nir_address_format_logical,
.shared_addr_format = nir_address_format_32bit_offset,
#ifdef VK_ENABLE_BETA_EXTENSIONS
.shader_index = node_info ? node_info->index : 0,
#endif
};
result = vk_pipeline_shader_stage_to_nir(&pdevice->vk, sinfo,
@@ -367,8 +375,9 @@ lvp_ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, ui
return ycbcr_conversion ? &ycbcr_conversion->state : NULL;
}
/* pipeline is NULL for shader objects. */
static void
lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader *shader, struct lvp_pipeline_layout *layout)
lvp_shader_lower(struct lvp_device *pdevice, struct lvp_pipeline *pipeline, nir_shader *nir, struct lvp_shader *shader, struct lvp_pipeline_layout *layout)
{
if (nir->info.stage != MESA_SHADER_TESS_CTRL)
NIR_PASS_V(nir, remove_barriers, nir->info.stage == MESA_SHADER_COMPUTE || nir->info.stage == MESA_SHADER_MESH || nir->info.stage == MESA_SHADER_TASK);
@@ -413,6 +422,9 @@ lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader
nir_var_mem_global,
nir_address_format_64bit_global);
if (nir->info.stage == MESA_SHADER_COMPUTE)
lvp_lower_exec_graph(pipeline, nir);
NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lvp_ycbcr_conversion_lookup, layout);
nir_lower_non_uniform_access_options options = {
@@ -492,7 +504,7 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
nir_shader *nir;
VkResult result = compile_spirv(pdevice, sinfo, &nir);
if (result == VK_SUCCESS)
lvp_shader_lower(pdevice, nir, shader, pipeline->layout);
lvp_shader_lower(pdevice, pipeline, nir, shader, pipeline->layout);
return result;
}
@@ -1027,6 +1039,12 @@ get_pipeline_create_flags(const void *pCreateInfo)
const VkRayTracingPipelineCreateInfoKHR *create_info = (VkRayTracingPipelineCreateInfoKHR *)pCreateInfo;
return create_info->flags;
}
#ifdef VK_ENABLE_BETA_EXTENSIONS
case VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX: {
const VkExecutionGraphPipelineCreateInfoAMDX *create_info = (VkExecutionGraphPipelineCreateInfoAMDX *)pCreateInfo;
return create_info->flags;
}
#endif
default:
unreachable("invalid pCreateInfo pipeline struct");
}
@@ -1263,7 +1281,7 @@ create_shader_object(struct lvp_device *device, const VkShaderCreateInfoEXT *pCr
pCreateInfo->pPushConstantRanges,
};
shader->layout = lvp_pipeline_layout_create(device, &pci, pAllocator);
lvp_shader_lower(device, nir, shader, shader->layout);
lvp_shader_lower(device, NULL, nir, shader, shader->layout);
lvp_shader_xfb_init(shader);
if (stage == MESA_SHADER_TESS_EVAL) {
/* spec requires that all tess modes are set in both shaders */
@@ -1339,3 +1357,163 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_GetShaderBinaryDataEXT(
}
return ret;
}
#ifdef VK_ENABLE_BETA_EXTENSIONS
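/* VK_AMDX_shader_enqueue: an execution graph pipeline is built as one compute
 * pipeline per node, plus any node pipelines pulled in from libraries. */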
static VkResult
lvp_exec_graph_pipeline_create(VkDevice _device, VkPipelineCache _cache,
const VkExecutionGraphPipelineCreateInfoAMDX *create_info,
VkPipelineCreateFlagBits2KHR flags,
VkPipeline *out_pipeline)
{
LVP_FROM_HANDLE(lvp_device, device, _device);
struct lvp_pipeline *pipeline;
VkResult result;
assert(create_info->sType == VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX);
uint32_t stage_count = create_info->stageCount;
if (create_info->pLibraryInfo) {
for (uint32_t i = 0; i < create_info->pLibraryInfo->libraryCount; i++) {
VK_FROM_HANDLE(lvp_pipeline, library, create_info->pLibraryInfo->pLibraries[i]);
stage_count += library->num_groups;
}
}
pipeline = vk_zalloc(&device->vk.alloc, sizeof(*pipeline) + stage_count * sizeof(VkPipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!pipeline)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
vk_object_base_init(&device->vk, &pipeline->base,
VK_OBJECT_TYPE_PIPELINE);
uint64_t t0 = os_time_get_nano();
pipeline->type = LVP_PIPELINE_EXEC_GRAPH;
pipeline->layout = lvp_pipeline_layout_from_handle(create_info->layout);
pipeline->exec_graph.scratch_size = 0;
pipeline->num_groups = stage_count;
uint32_t stage_index = 0;
for (uint32_t i = 0; i < create_info->stageCount; i++) {
const VkPipelineShaderStageNodeCreateInfoAMDX *node_info = vk_find_struct_const(
create_info->pStages[i].pNext, PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX);
VkComputePipelineCreateInfo stage_create_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.flags = create_info->flags,
.stage = create_info->pStages[i],
.layout = create_info->layout,
};
result = lvp_compute_pipeline_create(_device, _cache, &stage_create_info, flags, &pipeline->groups[i]);
if (result != VK_SUCCESS)
goto fail;
VK_FROM_HANDLE(lvp_pipeline, stage, pipeline->groups[i]);
nir_shader *nir = stage->shaders[MESA_SHADER_COMPUTE].pipeline_nir->nir;
if (node_info) {
stage->exec_graph.name = node_info->pName;
stage->exec_graph.index = node_info->index;
}
/* TODO: Add a shader info NIR pass to figure out how many payloads the shader creates. */
stage->exec_graph.scratch_size = nir->info.cs.node_payloads_size * 256;
pipeline->exec_graph.scratch_size = MAX2(pipeline->exec_graph.scratch_size, stage->exec_graph.scratch_size);
stage_index++;
}
if (create_info->pLibraryInfo) {
for (uint32_t i = 0; i < create_info->pLibraryInfo->libraryCount; i++) {
VK_FROM_HANDLE(lvp_pipeline, library, create_info->pLibraryInfo->pLibraries[i]);
for (uint32_t j = 0; j < library->num_groups; j++) {
/* TODO: Do we need reference counting? */
pipeline->groups[stage_index] = library->groups[j];
stage_index++;
}
pipeline->exec_graph.scratch_size = MAX2(pipeline->exec_graph.scratch_size, library->exec_graph.scratch_size);
}
}
const VkPipelineCreationFeedbackCreateInfo *feedback = (void*)vk_find_struct_const(create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (feedback) {
feedback->pPipelineCreationFeedback->duration = os_time_get_nano() - t0;
feedback->pPipelineCreationFeedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
memset(feedback->pPipelineStageCreationFeedbacks, 0, sizeof(VkPipelineCreationFeedback) * feedback->pipelineStageCreationFeedbackCount);
}
*out_pipeline = lvp_pipeline_to_handle(pipeline);
return VK_SUCCESS;
fail:
for (uint32_t i = 0; i < stage_count; i++)
lvp_DestroyPipeline(_device, pipeline->groups[i], NULL);
vk_free(&device->vk.alloc, pipeline);
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_CreateExecutionGraphPipelinesAMDX(VkDevice device, VkPipelineCache pipelineCache,
uint32_t createInfoCount,
const VkExecutionGraphPipelineCreateInfoAMDX *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
VkResult result = VK_SUCCESS;
uint32_t i = 0;
for (; i < createInfoCount; i++) {
VkPipelineCreateFlagBits2KHR flags = get_pipeline_create_flags(&pCreateInfos[i]);
VkResult r = VK_PIPELINE_COMPILE_REQUIRED;
if (!(flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR))
r = lvp_exec_graph_pipeline_create(device, pipelineCache, &pCreateInfos[i], flags, &pPipelines[i]);
if (r != VK_SUCCESS) {
result = r;
pPipelines[i] = VK_NULL_HANDLE;
if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
break;
}
}
if (result != VK_SUCCESS) {
for (; i < createInfoCount; i++)
pPipelines[i] = VK_NULL_HANDLE;
}
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_GetExecutionGraphPipelineScratchSizeAMDX(VkDevice device, VkPipeline executionGraph,
VkExecutionGraphPipelineScratchSizeAMDX *pSizeInfo)
{
VK_FROM_HANDLE(lvp_pipeline, pipeline, executionGraph);
pSizeInfo->size = MAX2(pipeline->exec_graph.scratch_size * 32, 16);
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_GetExecutionGraphPipelineNodeIndexAMDX(VkDevice device, VkPipeline executionGraph,
const VkPipelineShaderStageNodeCreateInfoAMDX *pNodeInfo,
uint32_t *pNodeIndex)
{
VK_FROM_HANDLE(lvp_pipeline, pipeline, executionGraph);
for (uint32_t i = 0; i < pipeline->num_groups; i++) {
VK_FROM_HANDLE(lvp_pipeline, stage, pipeline->groups[i]);
if (stage->exec_graph.index == pNodeInfo->index &&
!strcmp(stage->exec_graph.name, pNodeInfo->pName)) {
*pNodeIndex = i;
return VK_SUCCESS;
}
}
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
#endif


@@ -495,11 +495,37 @@ struct lvp_pipeline {
bool compiled;
bool used;
struct {
const char *name;
const char *next_name;
uint32_t index;
uint32_t scratch_size;
} exec_graph;
unsigned num_groups;
unsigned num_groups_total;
VkPipeline groups[0];
};
/* Minimum requirement by the spec. */
#define LVP_MAX_EXEC_GRAPH_PAYLOADS 256
struct lvp_exec_graph_shader_output {
uint32_t payload_count;
uint32_t node_index;
};
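/* Read and written by the lowered shaders via the lvp_load/store_internal_field
 * macros in lvp_lower_exec_graph.c. */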
struct lvp_exec_graph_internal_data {
/* inputs */
void *payload_in;
void *payloads;
/* outputs */
struct lvp_exec_graph_shader_output outputs[LVP_MAX_EXEC_GRAPH_PAYLOADS];
};
bool
lvp_lower_exec_graph(struct lvp_pipeline *pipeline, nir_shader *nir);
void
lvp_pipeline_shaders_compile(struct lvp_pipeline *pipeline, bool locked);


@@ -20,6 +20,7 @@ liblvp_files = files(
'lvp_image.c',
'lvp_formats.c',
'lvp_inline_uniforms.c',
'lvp_lower_exec_graph.c',
'lvp_lower_vulkan_resource.c',
'lvp_lower_vulkan_resource.h',
'lvp_lower_input_attachments.c',