lavapipe: Implement exec graph pipelines

Just a collection of compute shaders that can enqueue each other.

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24512>
Konstantin Seurer 2023-08-01 14:39:03 +02:00 committed by Marge Bot
parent b817b597c7
commit ff6a133b72
4 changed files with 389 additions and 3 deletions
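For context, the application-facing API being implemented is VK_AMDX_shader_enqueue (a beta extension, so VK_ENABLE_BETA_EXTENSIONS must be defined). Below is a minimal sketch of the create/query path, assuming the caller already has a device, pipeline layout and a filled-in compute shader stage; the node name "node_main" is a placeholder.

#define VK_ENABLE_BETA_EXTENSIONS
#include <vulkan/vulkan.h>

/* Sketch only: `device`, `layout` and the compute-stage
 * VkPipelineShaderStageCreateInfo are assumptions supplied by the caller;
 * "node_main" is a made-up node name. */
static VkPipeline
create_graph(VkDevice device, VkPipelineLayout layout,
             VkPipelineShaderStageCreateInfo stage_info)
{
   VkPipelineShaderStageNodeCreateInfoAMDX node_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX,
      .pName = "node_main",
      .index = 0,
   };
   stage_info.pNext = &node_info;

   VkExecutionGraphPipelineCreateInfoAMDX graph_info = {
      .sType = VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX,
      .stageCount = 1,
      .pStages = &stage_info,
      .layout = layout,
   };
   VkPipeline graph = VK_NULL_HANDLE;
   if (vkCreateExecutionGraphPipelinesAMDX(device, VK_NULL_HANDLE, 1,
                                           &graph_info, NULL, &graph) != VK_SUCCESS)
      return VK_NULL_HANDLE;

   /* The returned size bounds the scratch buffer needed at dispatch time. */
   VkExecutionGraphPipelineScratchSizeAMDX scratch = {
      .sType = VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_SCRATCH_SIZE_AMDX,
   };
   vkGetExecutionGraphPipelineScratchSizeAMDX(device, graph, &scratch);

   return graph;
}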


@@ -0,0 +1,181 @@
/*
* Copyright © 2023 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "lvp_private.h"
#include "nir_builder.h"
#define lvp_load_internal_field(b, bit_size, field) \
nir_load_ssbo(b, 1, bit_size, nir_imm_int(b, 0), \
nir_imm_int(b, offsetof(struct lvp_exec_graph_internal_data, field)))
#define lvp_store_internal_field(b, value, field, scope) \
nir_store_ssbo(b, value, nir_imm_int(b, 0), \
nir_iadd_imm(b, \
nir_imul_imm(b, nir_load_local_invocation_index(b), \
scope == SCOPE_INVOCATION \
? sizeof(struct lvp_exec_graph_shader_output) \
: 0), \
offsetof(struct lvp_exec_graph_internal_data, outputs) + \
offsetof(struct lvp_exec_graph_shader_output, field)))
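/* Rewrite node payload derefs into global-memory derefs: incoming payloads go
 * through the payload_in pointer from the internal data, outgoing payloads
 * through the 64-bit address held by the lowered payload variable. */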
static bool
lvp_lower_node_payload_deref(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_deref)
return false;
nir_deref_instr *deref = nir_instr_as_deref(instr);
bool is_payload = nir_deref_mode_is(deref, nir_var_mem_node_payload);
bool is_payload_in = nir_deref_mode_is(deref, nir_var_mem_node_payload_in);
if (!is_payload && !is_payload_in)
return false;
deref->modes = nir_var_mem_global;
if (deref->deref_type != nir_deref_type_var)
return true;
if (is_payload_in) {
b->cursor = nir_after_instr(instr);
nir_def *payload = lvp_load_internal_field(b, 64, payload_in);
nir_deref_instr *cast = nir_build_deref_cast(b, payload, nir_var_mem_global, deref->type, 0);
nir_def_rewrite_uses(&deref->def, &cast->def);
} else {
nir_foreach_use_safe(use, &deref->def) {
b->cursor = nir_before_instr(use->parent_instr);
nir_def *payload = nir_load_var(b, deref->var);
nir_deref_instr *cast =
nir_build_deref_cast(b, payload, nir_var_mem_global, deref->type, 0);
nir_src_rewrite(use, &cast->def);
}
}
nir_instr_remove(instr);
return true;
}
static bool
lvp_lower_node_payload_derefs(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lvp_lower_node_payload_deref,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
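/* initialize_node_payloads picks the per-invocation (or per-workgroup) slice
 * of the payloads buffer, stores its address in the payload variable and
 * records the payload count and target node index in the internal outputs. */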
static void
lvp_build_initialize_node_payloads(nir_builder *b, nir_intrinsic_instr *intr)
{
mesa_scope scope = nir_intrinsic_execution_scope(intr);
assert(scope == SCOPE_INVOCATION || scope == SCOPE_WORKGROUP);
nir_deref_instr *payloads_deref = nir_src_as_deref(intr->src[0]);
assert(payloads_deref->deref_type == nir_deref_type_var);
nir_variable *payloads_var = payloads_deref->var;
nir_def *addr = lvp_load_internal_field(b, 64, payloads);
if (scope == SCOPE_INVOCATION) {
nir_def *payloads_offset =
nir_imul_imm(b, nir_load_local_invocation_index(b), b->shader->info.cs.node_payloads_size);
addr = nir_iadd(b, addr, nir_u2u64(b, payloads_offset));
}
nir_store_var(b, payloads_var, addr, 0x1);
nir_def *payload_count = intr->src[1].ssa;
lvp_store_internal_field(b, payload_count, payload_count, scope);
nir_def *node_index = intr->src[2].ssa;
lvp_store_internal_field(b, node_index, node_index, scope);
}
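/* Lower the remaining exec graph intrinsics: enqueue_node_payloads becomes a
 * no-op (the payloads already live in memory), finalizing an incoming payload
 * always succeeds, and inputs are never coalesced (count of 1). */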
static bool
lvp_lower_node_payload_intrinsic(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_enqueue_node_payloads) {
nir_instr_remove(instr);
return false;
}
b->cursor = nir_after_instr(instr);
switch (intr->intrinsic) {
case nir_intrinsic_initialize_node_payloads:
lvp_build_initialize_node_payloads(b, intr);
nir_instr_remove(instr);
return true;
case nir_intrinsic_finalize_incoming_node_payload:
nir_def_rewrite_uses(&intr->def, nir_imm_true(b));
nir_instr_remove(instr);
return true;
case nir_intrinsic_load_coalesced_input_count:
nir_def_rewrite_uses(&intr->def, nir_imm_int(b, 1));
nir_instr_remove(instr);
return true;
default:
return false;
}
}
static bool
lvp_lower_exec_graph_intrinsics(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lvp_lower_node_payload_intrinsic,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
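/* Demote node payload variables to 64-bit addresses held in shader_temp
 * storage and remember the name of the node the outgoing payloads target. */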
static void
lvp_lower_node_payload_vars(struct lvp_pipeline *pipeline, nir_shader *nir)
{
nir_foreach_variable_in_shader(var, nir) {
if (var->data.mode != nir_var_mem_node_payload &&
var->data.mode != nir_var_mem_node_payload_in)
continue;
if (var->data.mode == nir_var_mem_node_payload) {
assert(var->data.node_name);
assert(!pipeline->exec_graph.next_name);
pipeline->exec_graph.next_name = var->data.node_name;
}
var->data.mode = nir_var_shader_temp;
var->type = glsl_uint64_t_type();
}
}
bool
lvp_lower_exec_graph(struct lvp_pipeline *pipeline, nir_shader *nir)
{
bool progress = false;
NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
nir_var_mem_node_payload | nir_var_mem_node_payload_in,
glsl_get_natural_size_align_bytes);
if (!progress)
return false;
/* Lower node payload variables to 64-bit addresses. */
lvp_lower_node_payload_vars(pipeline, nir);
/* Lower exec graph intrinsics to their actual implementation. */
lvp_lower_exec_graph_intrinsics(nir);
/* Lower node payloads to load/store_global instructions. */
lvp_lower_node_payload_derefs(nir);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, nir_address_format_64bit_global);
/* Cleanup passes */
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_opt_dce);
return true;
}


@@ -279,6 +279,11 @@ compile_spirv(struct lvp_device *pdevice, const VkPipelineShaderStageCreateInfo
assert(stage <= LVP_SHADER_STAGES && stage != MESA_SHADER_NONE);
VkResult result;
#ifdef VK_ENABLE_BETA_EXTENSIONS
const VkPipelineShaderStageNodeCreateInfoAMDX *node_info = vk_find_struct_const(
sinfo->pNext, PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX);
#endif
const struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_VULKAN,
.caps = {
@@ -333,6 +338,9 @@ compile_spirv(struct lvp_device *pdevice, const VkPipelineShaderStageCreateInfo
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.push_const_addr_format = nir_address_format_logical,
.shared_addr_format = nir_address_format_32bit_offset,
#ifdef VK_ENABLE_BETA_EXTENSIONS
.shader_index = node_info ? node_info->index : 0,
#endif
};
result = vk_pipeline_shader_stage_to_nir(&pdevice->vk, sinfo,
@@ -367,8 +375,9 @@ lvp_ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, ui
return ycbcr_conversion ? &ycbcr_conversion->state : NULL;
}
/* pipeline is NULL for shader objects. */
static void
lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader *shader, struct lvp_pipeline_layout *layout)
lvp_shader_lower(struct lvp_device *pdevice, struct lvp_pipeline *pipeline, nir_shader *nir, struct lvp_shader *shader, struct lvp_pipeline_layout *layout)
{
if (nir->info.stage != MESA_SHADER_TESS_CTRL)
NIR_PASS_V(nir, remove_barriers, nir->info.stage == MESA_SHADER_COMPUTE || nir->info.stage == MESA_SHADER_MESH || nir->info.stage == MESA_SHADER_TASK);
@@ -413,6 +422,9 @@ lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader
nir_var_mem_global,
nir_address_format_64bit_global);
if (nir->info.stage == MESA_SHADER_COMPUTE)
lvp_lower_exec_graph(pipeline, nir);
NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lvp_ycbcr_conversion_lookup, layout);
nir_lower_non_uniform_access_options options = {
@@ -492,7 +504,7 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
nir_shader *nir;
VkResult result = compile_spirv(pdevice, sinfo, &nir);
if (result == VK_SUCCESS)
lvp_shader_lower(pdevice, nir, shader, pipeline->layout);
lvp_shader_lower(pdevice, pipeline, nir, shader, pipeline->layout);
return result;
}
@@ -1027,6 +1039,12 @@ get_pipeline_create_flags(const void *pCreateInfo)
const VkRayTracingPipelineCreateInfoKHR *create_info = (VkRayTracingPipelineCreateInfoKHR *)pCreateInfo;
return create_info->flags;
}
#ifdef VK_ENABLE_BETA_EXTENSIONS
case VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX: {
const VkExecutionGraphPipelineCreateInfoAMDX *create_info = (VkExecutionGraphPipelineCreateInfoAMDX *)pCreateInfo;
return create_info->flags;
}
#endif
default:
unreachable("invalid pCreateInfo pipeline struct");
}
@@ -1263,7 +1281,7 @@ create_shader_object(struct lvp_device *device, const VkShaderCreateInfoEXT *pCr
pCreateInfo->pPushConstantRanges,
};
shader->layout = lvp_pipeline_layout_create(device, &pci, pAllocator);
lvp_shader_lower(device, nir, shader, shader->layout);
lvp_shader_lower(device, NULL, nir, shader, shader->layout);
lvp_shader_xfb_init(shader);
if (stage == MESA_SHADER_TESS_EVAL) {
/* spec requires that all tess modes are set in both shaders */
@@ -1339,3 +1357,163 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_GetShaderBinaryDataEXT(
}
return ret;
}
#ifdef VK_ENABLE_BETA_EXTENSIONS
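/* VK_AMDX_shader_enqueue: an execution graph pipeline is built as one compute
 * pipeline per node, plus any node pipelines pulled in from libraries. */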
static VkResult
lvp_exec_graph_pipeline_create(VkDevice _device, VkPipelineCache _cache,
const VkExecutionGraphPipelineCreateInfoAMDX *create_info,
VkPipelineCreateFlagBits2KHR flags,
VkPipeline *out_pipeline)
{
LVP_FROM_HANDLE(lvp_device, device, _device);
struct lvp_pipeline *pipeline;
VkResult result;
assert(create_info->sType == VK_STRUCTURE_TYPE_EXECUTION_GRAPH_PIPELINE_CREATE_INFO_AMDX);
uint32_t stage_count = create_info->stageCount;
if (create_info->pLibraryInfo) {
for (uint32_t i = 0; i < create_info->pLibraryInfo->libraryCount; i++) {
VK_FROM_HANDLE(lvp_pipeline, library, create_info->pLibraryInfo->pLibraries[i]);
stage_count += library->num_groups;
}
}
pipeline = vk_zalloc(&device->vk.alloc, sizeof(*pipeline) + stage_count * sizeof(VkPipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!pipeline)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
vk_object_base_init(&device->vk, &pipeline->base,
VK_OBJECT_TYPE_PIPELINE);
uint64_t t0 = os_time_get_nano();
pipeline->type = LVP_PIPELINE_EXEC_GRAPH;
pipeline->layout = lvp_pipeline_layout_from_handle(create_info->layout);
pipeline->exec_graph.scratch_size = 0;
pipeline->num_groups = stage_count;
uint32_t stage_index = 0;
for (uint32_t i = 0; i < create_info->stageCount; i++) {
const VkPipelineShaderStageNodeCreateInfoAMDX *node_info = vk_find_struct_const(
create_info->pStages[i].pNext, PIPELINE_SHADER_STAGE_NODE_CREATE_INFO_AMDX);
VkComputePipelineCreateInfo stage_create_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.flags = create_info->flags,
.stage = create_info->pStages[i],
.layout = create_info->layout,
};
result = lvp_compute_pipeline_create(_device, _cache, &stage_create_info, flags, &pipeline->groups[i]);
if (result != VK_SUCCESS)
goto fail;
VK_FROM_HANDLE(lvp_pipeline, stage, pipeline->groups[i]);
nir_shader *nir = stage->shaders[MESA_SHADER_COMPUTE].pipeline_nir->nir;
if (node_info) {
stage->exec_graph.name = node_info->pName;
stage->exec_graph.index = node_info->index;
}
/* TODO: Add a shader info NIR pass to figure out how many payloads the shader creates. */
stage->exec_graph.scratch_size = nir->info.cs.node_payloads_size * 256;
pipeline->exec_graph.scratch_size = MAX2(pipeline->exec_graph.scratch_size, stage->exec_graph.scratch_size);
stage_index++;
}
if (create_info->pLibraryInfo) {
for (uint32_t i = 0; i < create_info->pLibraryInfo->libraryCount; i++) {
VK_FROM_HANDLE(lvp_pipeline, library, create_info->pLibraryInfo->pLibraries[i]);
for (uint32_t j = 0; j < library->num_groups; j++) {
/* TODO: Do we need reference counting? */
pipeline->groups[stage_index] = library->groups[j];
stage_index++;
}
pipeline->exec_graph.scratch_size = MAX2(pipeline->exec_graph.scratch_size, library->exec_graph.scratch_size);
}
}
const VkPipelineCreationFeedbackCreateInfo *feedback = (void*)vk_find_struct_const(create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (feedback) {
feedback->pPipelineCreationFeedback->duration = os_time_get_nano() - t0;
feedback->pPipelineCreationFeedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
memset(feedback->pPipelineStageCreationFeedbacks, 0, sizeof(VkPipelineCreationFeedback) * feedback->pipelineStageCreationFeedbackCount);
}
*out_pipeline = lvp_pipeline_to_handle(pipeline);
return VK_SUCCESS;
fail:
for (uint32_t i = 0; i < stage_count; i++)
lvp_DestroyPipeline(_device, pipeline->groups[i], NULL);
vk_free(&device->vk.alloc, pipeline);
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_CreateExecutionGraphPipelinesAMDX(VkDevice device, VkPipelineCache pipelineCache,
uint32_t createInfoCount,
const VkExecutionGraphPipelineCreateInfoAMDX *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
VkResult result = VK_SUCCESS;
uint32_t i = 0;
for (; i < createInfoCount; i++) {
VkPipelineCreateFlagBits2KHR flags = get_pipeline_create_flags(&pCreateInfos[i]);
VkResult r = VK_PIPELINE_COMPILE_REQUIRED;
if (!(flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR))
r = lvp_exec_graph_pipeline_create(device, pipelineCache, &pCreateInfos[i], flags, &pPipelines[i]);
if (r != VK_SUCCESS) {
result = r;
pPipelines[i] = VK_NULL_HANDLE;
if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
break;
}
}
if (result != VK_SUCCESS) {
for (; i < createInfoCount; i++)
pPipelines[i] = VK_NULL_HANDLE;
}
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_GetExecutionGraphPipelineScratchSizeAMDX(VkDevice device, VkPipeline executionGraph,
VkExecutionGraphPipelineScratchSizeAMDX *pSizeInfo)
{
VK_FROM_HANDLE(lvp_pipeline, pipeline, executionGraph);
pSizeInfo->size = MAX2(pipeline->exec_graph.scratch_size * 32, 16);
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
lvp_GetExecutionGraphPipelineNodeIndexAMDX(VkDevice device, VkPipeline executionGraph,
const VkPipelineShaderStageNodeCreateInfoAMDX *pNodeInfo,
uint32_t *pNodeIndex)
{
VK_FROM_HANDLE(lvp_pipeline, pipeline, executionGraph);
for (uint32_t i = 0; i < pipeline->num_groups; i++) {
VK_FROM_HANDLE(lvp_pipeline, stage, pipeline->groups[i]);
if (stage->exec_graph.index == pNodeInfo->index &&
!strcmp(stage->exec_graph.name, pNodeInfo->pName)) {
*pNodeIndex = i;
return VK_SUCCESS;
}
}
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
#endif


@@ -495,11 +495,37 @@ struct lvp_pipeline {
bool compiled;
bool used;
struct {
const char *name;
const char *next_name;
uint32_t index;
uint32_t scratch_size;
} exec_graph;
unsigned num_groups;
unsigned num_groups_total;
VkPipeline groups[0];
};
/* Minimum requirement by the spec. */
#define LVP_MAX_EXEC_GRAPH_PAYLOADS 256
struct lvp_exec_graph_shader_output {
uint32_t payload_count;
uint32_t node_index;
};
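/* Read and written by the lowered shaders via the lvp_load/store_internal_field
 * macros in lvp_lower_exec_graph.c. */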
struct lvp_exec_graph_internal_data {
/* inputs */
void *payload_in;
void *payloads;
/* outputs */
struct lvp_exec_graph_shader_output outputs[LVP_MAX_EXEC_GRAPH_PAYLOADS];
};
bool
lvp_lower_exec_graph(struct lvp_pipeline *pipeline, nir_shader *nir);
void
lvp_pipeline_shaders_compile(struct lvp_pipeline *pipeline, bool locked);


@@ -20,6 +20,7 @@ liblvp_files = files(
'lvp_image.c',
'lvp_formats.c',
'lvp_inline_uniforms.c',
'lvp_lower_exec_graph.c',
'lvp_lower_vulkan_resource.c',
'lvp_lower_vulkan_resource.h',
'lvp_lower_input_attachments.c',