brw: add support for no VF input slot compaction

Normally the driver & compiler work together to use as few 3DSTATE_VERTEX_ELEMENTS/VERTEX_BUFFER_ELEMENT data as possible. The compiler ignores unused bits and the driver avoids emitting the corresponding elements in 3DSTATE_VERTEX_ELEMENTS. For device generated commands, we want a 3DSTATE_VERTEX_ELEMENTS programming that is independent from the shader so that we can implement indirect pipeline binding without complicating the generation shader as well as emitting fewer generated commands. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32418>
2025-12-21 15:50:11 +01:00 · 2024-12-02 15:00:34 +02:00 · 2024-12-02 15:00:34 +02:00 · 6845dede59
commit 6845dede59
parent f19c5f4fcc
2 changed files with 49 additions and 20 deletions
--- a/src/intel/compiler/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw_compile_vs.cpp
@ -88,6 +88,27 @@ brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data)
      }
   }

+   /* SKL PRMs, Vol 2a: Command Reference: Instructions,
+    * 3DSTATE_VF_COMPONENT_PACKING:
+    *
+    *    "At least one component of one "valid" Vertex Element must be
+    *     enabled."
+    */
+   if (nir->info.inputs_read == 0) {
+      if (prog_data->no_vf_slot_compaction) {
+         attributes[VERT_ATTRIB_GENERIC0].is_used = true;
+         attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1;
+      } else if (!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW) &&
+                 !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) &&
+                 !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) &&
+                 !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) &&
+                 !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID) &&
+                 !BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) {
+         attributes[VERT_ATTRIB_GENERIC0].is_used = true;
+         attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1;
+      }
+   }
+
   /* Compute the register offsets */
   unsigned reg_offset = 0;
   unsigned vertex_element = 0;
@ -102,7 +123,8 @@ brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data)
       *     and therefore no packing is performed on these elements (if
       *     Valid, all 4 components are stored)."
       */
-      if (vertex_element >= 32)
+      if (vertex_element >= 32 ||
+          (prog_data->no_vf_slot_compaction && a >= VERT_ATTRIB_GENERIC(32)))
         attributes[a].component_mask = 0xf;

      attributes[a].reg_offset = reg_offset;
@ -141,9 +163,10 @@ brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data)
   }

   /* Generate the packing array */
-   unsigned vf_offset = 0;
-   for (unsigned a = 0; a < ARRAY_SIZE(attributes) && vf_offset < 32; a++) {
-      if (!attributes[a].is_used)
+   unsigned vf_element_count = 0;
+   for (unsigned a = 0; a < ARRAY_SIZE(attributes) && vf_element_count < 32; a++) {
+      /* Consider all attributes used when no slot compaction is active */
+      if (!attributes[a].is_used && !prog_data->no_vf_slot_compaction)
         continue;

      uint32_t mask;
@ -161,22 +184,11 @@ brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data)
      }
      /* We should only have 4bits enabled max */
      assert((mask & ~0xfu) == 0);
-      prog_data->vf_component_packing[vf_offset / 8] |=
-         mask << (4 * (vf_offset % 8));
-      vf_offset++;
-   }

-   /* SKL PRMs, Vol 2a: Command Reference: Instructions,
-    * 3DSTATE_VF_COMPONENT_PACKING:
-    *
-    *    "At least one component of one "valid" Vertex Element must be
-    *     enabled."
-    */
-   if (prog_data->vf_component_packing[0] == 0 &&
-       prog_data->vf_component_packing[1] == 0 &&
-       prog_data->vf_component_packing[2] == 0 &&
-       prog_data->vf_component_packing[3] == 0)
-      prog_data->vf_component_packing[0] = 0x1;
+      prog_data->vf_component_packing[vf_element_count / 8] |=
+         mask << (4 * (vf_element_count % 8));
+      vf_element_count++;
+   }

   return reg_offset;
 }
@ -225,6 +237,13 @@ brw_compile_vs(const struct brw_compiler *compiler,
                                   params->base.debug_flag : DEBUG_VS);
   const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo);

+   /* We only expect slot compaction to be disabled when using device
+    * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS
+    * programming. This should always be enabled together with VF component
+    * packing to minimize the size of the payload.
+    */
+   assert(!key->no_vf_slot_compaction || key->vf_component_packing);
+
   prog_data->base.base.stage = MESA_SHADER_VERTEX;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
   prog_data->base.base.total_scratch = 0;
@ -233,6 +252,7 @@ brw_compile_vs(const struct brw_compiler *compiler,

   prog_data->inputs_read = nir->info.inputs_read;
   prog_data->double_inputs_read = nir->info.vs.double_inputs;
+   prog_data->no_vf_slot_compaction = key->no_vf_slot_compaction;

   brw_nir_lower_vs_inputs(nir);
   brw_nir_lower_vue_outputs(nir);
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@ -250,7 +250,15 @@ struct brw_vs_prog_key {
    */
   bool vf_component_packing : 1;

-   uint32_t padding : 31;
+   /** Prevent compaction of slots of VF inputs
+    *
+    * So that 3DSTATE_VERTEX_ELEMENTS programming remains independent of
+    * shader inputs (essentially an unused location should have an associated
+    * VERTEX_ELEMENT_STATE).
+    */
+   bool no_vf_slot_compaction : 1;
+
+   uint32_t padding : 30;
 };

 /** The program key for Tessellation Control Shaders. */
@ -1046,6 +1054,7 @@ struct brw_vs_prog_data {
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
+   bool no_vf_slot_compaction;

   uint32_t vf_component_packing[4];
 };