anv/brw: handle pipeline libraries with mesh

I always thought there was a massive issue with pipeline libraries &
mesh shaders. Indeed recent CTS tests have exposed a number of issues.

Some values delivered to the fragment shader are coming from different
places depending on whether the preceding shader is Mesh or not. For
example PrimitiveID is delivered in the per-primitive block in Mesh
pipelines whereas for other pipelines it's coming as a VUE slot (which
is per-vertex). Those are 2 different locations in the payload.

We have to find a layout for fragment shaders that is compatible with
everything. Leaving gaps here and there in the thread payload.

Fixes the following test pattern :

  dEQP-VK.mesh_shader.ext.smoke.fast_lib.shared_*

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Acked-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34109>
This commit is contained in:
Lionel Landwerlin 2025-03-18 20:56:02 +02:00 committed by Marge Bot
parent 18bbcf9a63
commit 5c7c1eceb5
8 changed files with 89 additions and 8 deletions

View file

@ -950,6 +950,8 @@ brw_nir_populate_wm_prog_data(nir_shader *shader,
*/
prog_data->alpha_to_coverage = key->alpha_to_coverage;
prog_data->mesh_input = key->mesh_input;
prog_data->uses_sample_mask =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);

View file

@ -380,7 +380,9 @@ struct brw_wm_prog_key {
static inline bool
brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key)
{
return key->alpha_to_coverage == INTEL_SOMETIMES ||
return
key->mesh_input == INTEL_SOMETIMES ||
key->alpha_to_coverage == INTEL_SOMETIMES ||
key->persample_interp == INTEL_SOMETIMES ||
key->multisample_fbo == INTEL_SOMETIMES ||
key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH;
@ -749,6 +751,11 @@ struct brw_wm_prog_data {
*/
enum intel_sometimes alpha_to_coverage;
/**
* Whether the shader is dispatched with a preceding mesh shader.
*/
enum intel_sometimes mesh_input;
/**
* Push constant location of intel_msaa_flags (dynamic configuration of the
* pixel shader).
@ -806,7 +813,8 @@ struct brw_wm_prog_data {
static inline bool
brw_wm_prog_data_is_dynamic(const struct brw_wm_prog_data *prog_data)
{
return prog_data->alpha_to_coverage == INTEL_SOMETIMES ||
return prog_data->mesh_input == INTEL_SOMETIMES ||
prog_data->alpha_to_coverage == INTEL_SOMETIMES ||
prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES ||
prog_data->persample_dispatch == INTEL_SOMETIMES;
}

View file

@ -4189,7 +4189,8 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp)
{
brw_shader &s = *bld.shader;
assert(s.stage == MESA_SHADER_FRAGMENT);
assert(BITFIELD64_BIT(location) & s.nir->info.per_primitive_inputs);
assert((BITFIELD64_BIT(location) & s.nir->info.per_primitive_inputs) ||
location == VARYING_SLOT_PRIMITIVE_ID);
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

View file

@ -110,6 +110,7 @@ void anv_nir_compute_push_layout(nir_shader *nir,
const struct anv_physical_device *pdevice,
enum brw_robustness_flags robust_flags,
bool fragment_dynamic,
bool mesh_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
const struct anv_pipeline_push_map *push_map,

View file

@ -31,6 +31,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
const struct anv_physical_device *pdevice,
enum brw_robustness_flags robust_flags,
bool fragment_dynamic,
bool mesh_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
const struct anv_pipeline_push_map *push_map,
@ -199,6 +200,34 @@ anv_nir_compute_push_layout(nir_shader *nir,
}
}
/* When platforms support Mesh and the fragment shader is not fully linked
* to the previous shader, payload format can change if the preceding
* shader is mesh or not, this is an issue in particular for PrimitiveID
* value (in legacy it's delivered as a VUE slot, in mesh it's delivered
* in the per-primitive block).
*
* Here is the difference in payload format:
*
* Legacy Mesh
* ------------------- -------------------
* | ... | | ... |
* |-----------------| |-----------------|
* | Constant data | | Constant data |
* |-----------------| |-----------------|
| VUE attributes  |   | Per Primitive   |
* ------------------- |-----------------|
* | VUE attributes |
* -------------------
*
* To solve that issue we push an additional dummy push constant buffer in
* legacy pipelines to align everything. The compiler then adds a SEL
* instruction to source the PrimitiveID from the right location based on a
* dynamic bit in fs_msaa_intel.
*/
const bool needs_padding_per_primitive =
mesh_dynamic &&
(nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID);
unsigned n_push_ranges = 0;
if (push_ubo_ranges) {
brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
@ -224,6 +253,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
(push_reg_mask_offset - push_start) / 4;
}
const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
unsigned range_start_reg = push_constant_range.length;
for (int i = 0; i < 4; i++) {
@ -231,7 +261,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
if (ubo_range->length == 0)
continue;
if (n_push_ranges >= 4) {
if (n_push_ranges >= max_push_buffers) {
memset(ubo_range, 0, sizeof(*ubo_range));
continue;
}
@ -288,6 +318,17 @@ anv_nir_compute_push_layout(nir_shader *nir,
prog_data->nr_params = 32 / 4;
}
if (needs_padding_per_primitive) {
struct anv_push_range push_constant_range = {
.set = ANV_DESCRIPTOR_SET_PER_PRIM_PADDING,
.start = 0,
.length = 1,
};
map->push_ranges[n_push_ranges++] = push_constant_range;
}
assert(n_push_ranges <= 4);
if (nir->info.stage == MESA_SHADER_FRAGMENT && fragment_dynamic) {
struct brw_wm_prog_data *wm_prog_data =
container_of(prog_data, struct brw_wm_prog_data, base);
@ -330,8 +371,12 @@ anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
prog_data_push_size += prog_data->ubo_ranges[i].length;
unsigned bind_map_push_size = 0;
for (unsigned i = 0; i < 4; i++)
for (unsigned i = 0; i < 4; i++) {
/* This is dynamic and doesn't count against prog_data->ubo_ranges[] */
if (map->push_ranges[i].set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
continue;
bind_map_push_size += map->push_ranges[i].length;
}
/* We could go through everything again but it should be enough to assert
* that they push the same number of registers. This should alert us if

View file

@ -604,6 +604,13 @@ anv_graphics_pipeline_stage_fragment_dynamic(const struct anv_pipeline_stage *st
brw_wm_prog_key_is_dynamic(&stage->key.wm);
}
static bool
anv_graphics_pipeline_stage_mesh_dynamic(const struct anv_pipeline_stage *stage)
{
return stage->stage == MESA_SHADER_FRAGMENT &&
stage->key.wm.mesh_input == INTEL_SOMETIMES;
}
static void
anv_pipeline_hash_common(struct mesa_sha1 *ctx,
const struct anv_pipeline *pipeline)
@ -1086,6 +1093,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
NIR_PASS_V(nir, anv_nir_compute_push_layout,
pdevice, stage->key.base.robust_flags,
anv_graphics_pipeline_stage_fragment_dynamic(stage),
anv_graphics_pipeline_stage_mesh_dynamic(stage),
prog_data, &stage->bind_map, &push_map,
pipeline->layout.type, mem_ctx);

View file

@ -3293,6 +3293,7 @@ anv_descriptor_set_write_template(struct anv_device *device,
const struct vk_descriptor_update_template *template,
const void *data);
#define ANV_DESCRIPTOR_SET_PER_PRIM_PADDING (UINT8_MAX - 5)
#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 4)
#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 3)
#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 2)

View file

@ -196,6 +196,7 @@ get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
}
case ANV_DESCRIPTOR_SET_NULL:
case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING:
return cmd_buffer->device->workaround_address;
default: {
@ -263,6 +264,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
case ANV_DESCRIPTOR_SET_NULL:
case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING:
return (range->start + range->length) * 32;
default: {
@ -459,6 +461,12 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
if (range->length == 0)
continue;
/* Never clear this padding register as it might contain payload
* data.
*/
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
continue;
unsigned bound_size =
get_push_range_bound_size(cmd_buffer, shader, range);
if (bound_size >= range->start * 32) {
@ -479,7 +487,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
}
}
/* Setting NULL resets the push constant state so that we allocate a new one
/* Setting NULL resets the push constant state so that we allocate a new one
* if needed. If push constant data not dirty, get_push_range_address can
* re-use existing allocation.
*
@ -511,14 +519,21 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
if (range->length == 0)
break;
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
break;
buffers[i] = get_push_range_address(cmd_buffer, shader, range);
max_push_range = MAX2(max_push_range, range->length);
buffer_count++;
}
/* We have at most 4 buffers but they should be tightly packed */
for (unsigned i = buffer_count; i < 4; i++)
assert(bind_map->push_ranges[i].length == 0);
for (unsigned i = buffer_count; i < 4; i++) {
assert(bind_map->push_ranges[i].length == 0 ||
bind_map->push_ranges[i].set ==
ANV_DESCRIPTOR_SET_PER_PRIM_PADDING);
}
}
#if GFX_VER >= 12