intel: introduce new VUE layout for separate compiled shader with mesh
Mesh shaders have a per-vertex block in the URB that is pretty much identical to the VUE format. Let's just reuse that concept to do all of our layout in the payload attribute registers. This ensures a consistent VUE layout between Mesh and non-Mesh pipelines.

We need a new way of laying out the VUE though, as we have to accommodate a HW constraint of at most 32 varyings total (per-primitive + per-vertex). This means we cannot have 2 locations in the payload for things like PrimitiveID, which can come from either the per-primitive or the per-vertex block.

The new layout places the PrimitiveID at the end of the per-vertex attributes and shrinks the delivery dynamically if the mesh stage is active. The shader is compiled with a MOV_INDIRECT to read the PrimitiveID from the right location in the attributes.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34109>
parent 2d396f6085
commit 18bbcf9a63
15 changed files with 953 additions and 1085 deletions
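As a reader aid before the diff itself: the commit's central idea is that the PrimitiveID's attribute slot is no longer fixed at fragment-shader compile time. Below is a minimal standalone C sketch of that idea; every name in it (read_primitive_id, PRIMID_OFFSET_MESH_MARKER, the array sizes) is a hypothetical stand-in, and the real mechanism is a MOV_INDIRECT emitted by the brw compiler with the offset computed in brw_compute_sbe_per_vertex_urb_read() further down.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical marker value, standing in for
 * INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH from the diff below. */
#define PRIMID_OFFSET_MESH_MARKER 0xffffffffu

/* One compiled code path; the dword offset is only known once the
 * pipeline (mesh vs. non-mesh) is known, so it arrives as data. */
static uint32_t
read_primitive_id(const uint32_t *per_primitive_block,
                  const uint32_t *per_vertex_attrs,
                  uint32_t primid_offset_dw)
{
   if (primid_offset_dw == PRIMID_OFFSET_MESH_MARKER)
      return per_primitive_block[0];          /* mesh: per-primitive block */
   return per_vertex_attrs[primid_offset_dw]; /* legacy: indirect read */
}

int main(void)
{
   uint32_t per_prim[4] = { 42, 0, 0, 0 };
   uint32_t attrs[12] = { 0 };
   attrs[8] = 7; /* PrimitiveID appended after two vec4 varyings = 8 dwords */

   /* Non-mesh: the driver computed 4 * (primitive_id_slot - first_slot). */
   printf("legacy PrimitiveID: %u\n", read_primitive_id(per_prim, attrs, 8));
   /* Mesh: the marker redirects the read to the per-primitive block. */
   printf("mesh PrimitiveID:   %u\n",
          read_primitive_id(per_prim, attrs, PRIMID_OFFSET_MESH_MARKER));
   return 0;
}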
@@ -4678,30 +4678,39 @@ iris_create_so_decl_list(const struct pipe_stream_output_info *info,
 }
 
 static inline int
-iris_compute_first_urb_slot_required(uint64_t inputs_read,
+iris_compute_first_urb_slot_required(struct iris_compiled_shader *fs_shader,
                                      const struct intel_vue_map *prev_stage_vue_map)
 {
 #if GFX_VER >= 9
-   return brw_compute_first_fs_urb_slot_required(inputs_read, prev_stage_vue_map);
+   uint32_t read_offset, read_length, num_varyings, primid_offset;
+   brw_compute_sbe_per_vertex_urb_read(prev_stage_vue_map,
+                                       false /* mesh */,
+                                       brw_wm_prog_data(fs_shader->brw_prog_data),
+                                       &read_offset, &read_length, &num_varyings,
+                                       &primid_offset);
+   return 2 * read_offset;
 #else
-   return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
+   const struct iris_fs_data *fs_data = iris_fs_data(fs_shader);
+   return elk_compute_first_urb_slot_required(fs_data->inputs, prev_stage_vue_map);
 #endif
 }
 
 static void
-iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
+iris_compute_sbe_urb_read_interval(struct iris_compiled_shader *fs_shader,
                                    const struct intel_vue_map *last_vue_map,
                                    bool two_sided_color,
                                    unsigned *out_offset,
                                    unsigned *out_length)
 {
+   const struct iris_fs_data *fs_data = iris_fs_data(fs_shader);
+
    /* The compiler computes the first URB slot without considering COL/BFC
    * swizzling (because it doesn't know whether it's enabled), so we need
    * to do that here too. This may result in a smaller offset, which
    * should be safe.
    */
    const unsigned first_slot =
-      iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
+      iris_compute_first_urb_slot_required(fs_shader, last_vue_map);
 
    /* This becomes the URB read offset (counted in pairs of slots). */
    assert(first_slot % 2 == 0);
@@ -4710,6 +4719,7 @@ iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
    /* We need to adjust the inputs read to account for front/back color
     * swizzling, as it can make the URB length longer.
     */
+   uint64_t fs_input_slots = fs_data->inputs;
    for (int c = 0; c <= 1; c++) {
       if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
          /* If two sided color is enabled, the fragment shader's gl_Color
@@ -4898,7 +4908,7 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
 
    unsigned urb_read_offset, urb_read_length;
-   iris_compute_sbe_urb_read_interval(fs_data->inputs,
+   iris_compute_sbe_urb_read_interval(ice->shaders.prog[MESA_SHADER_FRAGMENT],
                                       last_vue_map,
                                       cso_rast->light_twoside,
                                       &urb_read_offset, &urb_read_length);

@@ -630,223 +630,138 @@ static void
 calculate_urb_setup(const struct intel_device_info *devinfo,
                     const struct brw_wm_prog_key *key,
                     struct brw_wm_prog_data *prog_data,
-                    const nir_shader *nir,
-                    const struct brw_mue_map *mue_map)
+                    nir_shader *nir,
+                    const struct brw_mue_map *mue_map,
+                    int *per_primitive_offsets)
 {
    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
 
    int urb_next = 0; /* in vec4s */
 
+   /* Figure out where the PrimitiveID lives, either in the per-vertex block
+    * or in the per-primitive block or both.
+    */
+   const uint64_t per_vert_primitive_id =
+      key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
+   const uint64_t per_prim_primitive_id =
+      key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;
    const uint64_t inputs_read =
-      nir->info.inputs_read & ~nir->info.per_primitive_inputs;
+      nir->info.inputs_read &
+      (~nir->info.per_primitive_inputs | per_vert_primitive_id);
+   const uint64_t per_primitive_header_bits =
+      VARYING_BIT_PRIMITIVE_SHADING_RATE |
+      VARYING_BIT_LAYER |
+      VARYING_BIT_VIEWPORT |
+      VARYING_BIT_CULL_PRIMITIVE;
+   const uint64_t per_primitive_inputs =
+      nir->info.inputs_read &
+      (nir->info.per_primitive_inputs | per_prim_primitive_id) &
+      ~per_primitive_header_bits;
+   uint64_t unique_fs_attrs =
+      inputs_read & BRW_FS_VARYING_INPUT_MASK;
+   struct intel_vue_map vue_map;
+   uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX;
 
    /* Figure out where each of the incoming setup attributes lands. */
    if (key->mesh_input != INTEL_NEVER) {
-      /* Per-Primitive Attributes are laid out by Hardware before the regular
-       * attributes, so order them like this to make easy later to map setup
-       * into real HW registers.
-       */
-      if (nir->info.per_primitive_inputs) {
-         uint64_t per_prim_inputs_read =
-            nir->info.inputs_read & nir->info.per_primitive_inputs;
-
-         /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
-          * are always at the beginning, because they come from MUE
-          * Primitive Header, not Per-Primitive Attributes.
-          */
-         const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
-                                                VARYING_BIT_LAYER |
-                                                VARYING_BIT_PRIMITIVE_SHADING_RATE;
-
-         if (mue_map) {
-            unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
-            unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
-
-            bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
-
-            if (reads_header || mue_map->user_data_in_primitive_header) {
-               /* Primitive Shading Rate, Layer and Viewport live in the same
-                * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
-                * is dword 2).
-                */
-               if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
-                  prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
-
-               if (per_prim_inputs_read & VARYING_BIT_LAYER)
-                  prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
-
-               if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
-                  prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
-
-               per_prim_inputs_read &= ~primitive_header_bits;
-            } else {
-               /* If fs doesn't need primitive header, then it won't be made
-                * available through SBE_MESH, so we have to skip them when
-                * calculating offset from start of per-prim data.
-                */
-               per_prim_start_dw += mue_map->per_primitive_header_size_dw;
-               per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
-            }
-
-            u_foreach_bit64(i, per_prim_inputs_read) {
-               int start = mue_map->start_dw[i];
-
-               assert(start >= 0);
-               assert(mue_map->len_dw[i] > 0);
-
-               assert(unsigned(start) >= per_prim_start_dw);
-               unsigned pos_dw = unsigned(start) - per_prim_start_dw;
-
-               prog_data->urb_setup[i] = urb_next + pos_dw / 4;
-               prog_data->urb_setup_channel[i] = pos_dw % 4;
-            }
-
-            urb_next = per_prim_size_dw / 4;
-         } else {
-            /* With no MUE map, we never read the primitive header, and
-             * per-primitive attributes won't be packed either, so just lay
-             * them in varying order.
-             */
-            per_prim_inputs_read &= ~primitive_header_bits;
-
-            for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
-               if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
-                  prog_data->urb_setup[i] = urb_next++;
-               }
-            }
-
-            /* The actual setup attributes later must be aligned to a full GRF. */
-            urb_next = ALIGN(urb_next, 2);
-         }
-
-         prog_data->num_per_primitive_inputs = urb_next;
-      }
-
-      const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
-                                      VARYING_BIT_CLIP_DIST1;
-
-      uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
-
-      if (inputs_read & clip_dist_bits) {
-         assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
-         unique_fs_attrs &= ~clip_dist_bits;
-      }
-
-      if (mue_map) {
-         unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
-         unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
-
-         /* Per-Vertex header is available to fragment shader only if there's
-          * user data there.
-          */
-         if (!mue_map->user_data_in_vertex_header) {
-            per_vertex_start_dw += 8;
-            per_vertex_size_dw -= 8;
-         }
-
-         /* In Mesh, CLIP_DIST slots are always at the beginning, because
-          * they come from MUE Vertex Header, not Per-Vertex Attributes.
-          */
-         if (inputs_read & clip_dist_bits) {
-            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
-            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
-         } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
-            /* Clip distances are in MUE, but we are not reading them in FS. */
-            per_vertex_start_dw += 8;
-            per_vertex_size_dw -= 8;
-         }
-
-         /* Per-Vertex attributes are laid out ordered. Because we always link
-          * Mesh and Fragment shaders, the which slots are written and read by
-          * each of them will match. */
-         u_foreach_bit64(i, unique_fs_attrs) {
-            int start = mue_map->start_dw[i];
-
-            assert(start >= 0);
-            assert(mue_map->len_dw[i] > 0);
-
-            assert(unsigned(start) >= per_vertex_start_dw);
-            unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
-
-            prog_data->urb_setup[i] = urb_next + pos_dw / 4;
-            prog_data->urb_setup_channel[i] = pos_dw % 4;
-         }
-
-         urb_next += per_vertex_size_dw / 4;
-      } else {
-         /* If we don't have an MUE map, just lay down the inputs the FS reads
-          * in varying order, as we do for the legacy pipeline.
-          */
-         if (inputs_read & clip_dist_bits) {
-            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
-            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
-         }
-
-         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-            if (unique_fs_attrs & BITFIELD64_BIT(i))
-               prog_data->urb_setup[i] = urb_next++;
-         }
-      }
-   } else {
-      assert(!nir->info.per_primitive_inputs);
-
-      const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK;
-
-      uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits;
-
-      if (util_bitcount64(unique_fs_attrs) <= 16) {
-         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
-          * first 16 varying inputs, so we can put them wherever we want.
-          * Just put them in order.
-          *
-          * This is useful because it means that (a) inputs not used by the
-          * fragment shader won't take up valuable register space, and (b) we
-          * won't have to recompile the fragment shader if it gets paired with
-          * a different vertex (or geometry) shader.
-          */
-         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-            if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
-                BITFIELD64_BIT(i)) {
-               prog_data->urb_setup[i] = urb_next++;
-            }
-         }
-      } else {
-         /* We have enough input varyings that the SF/SBE pipeline stage can't
-          * arbitrarily rearrange them to suit our whim; we have to put them
-          * in an order that matches the output of the previous pipeline stage
-          * (geometry or vertex shader).
-          */
-
-         /* Re-compute the VUE map here in the case that the one coming from
-          * geometry has more than one position slot (used for Primitive
-          * Replication).
-          */
-         struct intel_vue_map prev_stage_vue_map;
-         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                             key->input_slots_valid,
-                             key->base.vue_layout, 1);
-
-         int first_slot =
-            brw_compute_first_fs_urb_slot_required(unique_fs_attrs,
-                                                   &prev_stage_vue_map);
-
-         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
-         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
-              slot++) {
-            int varying = prev_stage_vue_map.slot_to_varying[slot];
-            if (varying != BRW_VARYING_SLOT_PAD &&
-                (inputs_read & BRW_FS_VARYING_INPUT_MASK &
-                 BITFIELD64_BIT(varying))) {
-               prog_data->urb_setup[varying] = slot - first_slot;
-            }
-         }
-         urb_next = prev_stage_vue_map.num_slots - first_slot;
-      }
-   }
+      if (mue_map != NULL) {
+         memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map));
+
+         memcpy(per_primitive_offsets,
+                mue_map->per_primitive_offsets,
+                sizeof(mue_map->per_primitive_offsets));
+
+         u_foreach_bit64(location, per_primitive_inputs) {
+            assert(per_primitive_offsets[location] != -1);
+
+            first_read_offset = MIN2(first_read_offset,
+                                     (uint32_t)per_primitive_offsets[location]);
+            per_primitive_stride =
+               MAX2((uint32_t)per_primitive_offsets[location] + 16,
+                    per_primitive_stride);
+         }
+      } else {
+         brw_compute_vue_map(devinfo, &vue_map, inputs_read,
+                             key->base.vue_layout,
+                             1 /* pos_slots, TODO */);
+         brw_compute_per_primitive_map(per_primitive_offsets,
+                                       &per_primitive_stride,
+                                       &first_read_offset,
+                                       0, nir, nir_var_shader_in,
+                                       per_primitive_inputs,
+                                       true /* separate_shader */);
+      }
+
+      if (per_primitive_stride > first_read_offset) {
+         first_read_offset = ROUND_DOWN_TO(first_read_offset, 32);
+
+         /* Remove the first few unused registers */
+         for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
+            if (per_primitive_offsets[i] == -1)
+               continue;
+            per_primitive_offsets[i] -= first_read_offset;
+         }
+
+         prog_data->num_per_primitive_inputs =
+            2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32);
+      } else {
+         prog_data->num_per_primitive_inputs = 0;
+      }
+   }
+
+   /* Now do the per-vertex stuff (what used to be legacy pipeline) */
+   const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK;
+
+   unique_fs_attrs &= ~vue_header_bits;
+
+   /* If Mesh is involved, we cannot do any packing. Documentation doesn't say
+    * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when
+    * using Mesh.
+    */
+   if (util_bitcount64(unique_fs_attrs) <= 16 && key->mesh_input == INTEL_NEVER) {
+      /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do
+       * arbitrary rearrangement of the first 16 varying inputs, so we can put
+       * them wherever we want. Just put them in order.
+       *
+       * This is useful because it means that (a) inputs not used by the
+       * fragment shader won't take up valuable register space, and (b) we
+       * won't have to recompile the fragment shader if it gets paired with a
+       * different vertex (or geometry) shader.
+       */
+      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
+         if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
+             BITFIELD64_BIT(i)) {
+            prog_data->urb_setup[i] = urb_next++;
+         }
+      }
+   } else {
+      /* We have enough input varyings that the SF/SBE pipeline stage can't
+       * arbitrarily rearrange them to suit our whim; we have to put them in
+       * an order that matches the output of the previous pipeline stage
+       * (geometry or vertex shader).
+       */
+      int first_slot = 0;
+      for (int i = 0; i < vue_map.num_slots; i++) {
+         int varying = vue_map.slot_to_varying[i];
+         if (varying != BRW_VARYING_SLOT_PAD && varying > 0 &&
+             (inputs_read & BITFIELD64_BIT(varying)) != 0) {
+            first_slot = ROUND_DOWN_TO(i, 2);
+            break;
+         }
+      }
+
+      for (int slot = first_slot; slot < vue_map.num_slots; slot++) {
+         int varying = vue_map.slot_to_varying[slot];
+         if (varying != BRW_VARYING_SLOT_PAD &&
+             (inputs_read & BRW_FS_VARYING_INPUT_MASK &
+              BITFIELD64_BIT(varying))) {
+            prog_data->urb_setup[varying] = slot - first_slot;
+         }
+      }
+      urb_next = vue_map.num_slots - first_slot;
+   }
 
-   prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
+   prog_data->num_varying_inputs = urb_next;
    prog_data->inputs = inputs_read;
+   prog_data->per_primitive_inputs = per_primitive_inputs;
 
    brw_compute_urb_setup_index(prog_data);
 }
@@ -951,8 +866,6 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
 {
    prog_data->flat_inputs = 0;
 
-   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
-
    nir_foreach_shader_in_variable(var, shader) {
       /* flat shading */
       if (var->data.interpolation != INTERP_MODE_FLAT)
@@ -963,7 +876,7 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
 
       unsigned slots = glsl_count_attribute_slots(var->type, false);
       for (unsigned s = 0; s < slots; s++) {
-         int input_index = prog_data->urb_setup[var->data.location + s] - per_vertex_start;
+         int input_index = prog_data->urb_setup[var->data.location + s];
 
          if (input_index >= 0)
            prog_data->flat_inputs |= 1 << input_index;
@@ -1005,7 +918,8 @@ brw_nir_populate_wm_prog_data(nir_shader *shader,
                               const struct intel_device_info *devinfo,
                               const struct brw_wm_prog_key *key,
                               struct brw_wm_prog_data *prog_data,
-                              const struct brw_mue_map *mue_map)
+                              const struct brw_mue_map *mue_map,
+                              int *per_primitive_offsets)
 {
    prog_data->uses_kill = shader->info.fs.uses_discard;
    prog_data->uses_omask = !key->ignore_sample_mask_out &&
@@ -1158,7 +1072,7 @@ brw_nir_populate_wm_prog_data(nir_shader *shader,
       (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
        prog_data->coarse_pixel_dispatch != INTEL_NEVER);
 
-   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
+   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, per_primitive_offsets);
    brw_compute_flat_inputs(prog_data, shader);
 }
 
@@ -1479,6 +1393,44 @@ run_fs(brw_shader &s, bool allow_spilling, bool do_rep_send)
    return !s.failed;
 }
 
+static void
+brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data,
+                       int *per_primitive_offsets)
+{
+   fprintf(fp, "FS URB (inputs=0x%016" PRIx64 ", flat_inputs=0x%08x):\n",
+           prog_data->inputs, prog_data->flat_inputs);
+   fprintf(fp, "  URB setup:\n");
+   for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) {
+      if (prog_data->urb_setup[i] >= 0) {
+         fprintf(fp, "  [%02d]: %i channel=%u (%s)\n",
+                 i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i],
+                 gl_varying_slot_name_for_stage((gl_varying_slot)i,
+                                                MESA_SHADER_FRAGMENT));
+      }
+   }
+   fprintf(fp, "  URB setup attributes:\n");
+   for (uint32_t i = 0; i < prog_data->urb_setup_attribs_count; i++) {
+      fprintf(fp, "  [%02d]: %i (%s)\n",
+              i, prog_data->urb_setup_attribs[i],
+              gl_varying_slot_name_for_stage(
+                 (gl_varying_slot)prog_data->urb_setup_attribs[i],
+                 MESA_SHADER_FRAGMENT));
+   }
+   if (per_primitive_offsets) {
+      fprintf(fp, "  Per Primitive URB setup:\n");
+      for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
+         if (per_primitive_offsets[i] == -1 ||
+             i == VARYING_SLOT_PRIMITIVE_COUNT ||
+             i == VARYING_SLOT_PRIMITIVE_INDICES)
+            continue;
+         fprintf(fp, "  [%02d]: %i (%s)\n",
+                 i, per_primitive_offsets[i],
+                 gl_varying_slot_name_for_stage((gl_varying_slot)i,
+                                                MESA_SHADER_FRAGMENT));
+      }
+   }
+}
+
 const unsigned *
 brw_compile_fs(const struct brw_compiler *compiler,
                struct brw_compile_fs_params *params)
@@ -1518,8 +1470,15 @@ brw_compile_fs(const struct brw_compiler *compiler,
    brw_postprocess_nir(nir, compiler, debug_enabled,
                        key->base.robust_flags);
 
+   int per_primitive_offsets[VARYING_SLOT_MAX];
+   memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
+
    brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
-                                 params->mue_map);
+                                 params->mue_map,
+                                 per_primitive_offsets);
+
+   if (unlikely(debug_enabled))
+      brw_print_fs_urb_setup(stderr, prog_data, per_primitive_offsets);
 
    /* Either an unrestricted or a fixed SIMD16 subgroup size are
     * allowed -- The latter is needed for fast clear and replicated
@@ -1540,6 +1499,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                             prog_data, nir, 8, 1,
                             params->base.stats != NULL,
                             debug_enabled);
+      v8->import_per_primitive_offsets(per_primitive_offsets);
       if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
                                                 v8->fail_msg);
@@ -1645,6 +1605,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                             prog_data, nir, 32, 1,
                             params->base.stats != NULL,
                             debug_enabled);
+      v32->import_per_primitive_offsets(per_primitive_offsets);
       if (vbase)
         v32->import_uniforms(vbase);
 
@@ -1670,6 +1631,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                             prog_data, nir, 16, 1,
                             params->base.stats != NULL,
                             debug_enabled);
+      v16->import_per_primitive_offsets(per_primitive_offsets);
 
       if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
         brw_shader_perf_log(compiler, params->base.log_data,
@@ -1694,6 +1656,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                             prog_data, nir, 16, 1,
                             params->base.stats != NULL,
                             debug_enabled);
+      v16->import_per_primitive_offsets(per_primitive_offsets);
       if (v8)
         v16->import_uniforms(v8.get());
       if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
@@ -1728,6 +1691,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                             prog_data, nir, 32, 1,
                             params->base.stats != NULL,
                             debug_enabled);
+      v32->import_per_primitive_offsets(per_primitive_offsets);
       if (v8)
         v32->import_uniforms(v8.get());
       else if (v16)
@@ -1772,6 +1736,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                prog_data, nir, 32, 4,
                                params->base.stats != NULL,
                                debug_enabled);
+         vmulti->import_per_primitive_offsets(per_primitive_offsets);
          vmulti->import_uniforms(vbase);
          if (!run_fs(*vmulti, false, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
@@ -1792,6 +1757,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                prog_data, nir, 32, 2,
                                params->base.stats != NULL,
                                debug_enabled);
+         vmulti->import_per_primitive_offsets(per_primitive_offsets);
          vmulti->import_uniforms(vbase);
          if (!run_fs(*vmulti, false, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
@@ -1811,6 +1777,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                prog_data, nir, 16, 2,
                                params->base.stats != NULL,
                                debug_enabled);
+         vmulti->import_per_primitive_offsets(per_primitive_offsets);
          vmulti->import_uniforms(vbase);
          if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
@@ -1893,31 +1860,110 @@ brw_compile_fs(const struct brw_compiler *compiler,
 }
 
 extern "C" void
-brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data)
+brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map,
+                                    bool mesh,
+                                    const struct brw_wm_prog_data *wm_prog_data,
+                                    uint32_t *out_read_offset,
+                                    uint32_t *out_read_length,
+                                    uint32_t *out_num_varyings,
+                                    uint32_t *out_primitive_id_offset)
 {
-   fprintf(fp, "FS URB (inputs=0x%016lx, flat_inputs=0x%08x):\n",
-           prog_data->inputs, prog_data->flat_inputs);
-   fprintf(fp, "  URB setup:\n");
-   for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) {
-      if (prog_data->urb_setup[i] >= 0) {
-         fprintf(fp, "  [%02d]: %i channel=%u (%s)\n",
-                 i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i],
-                 gl_varying_slot_name_for_stage((gl_varying_slot)i,
-                                                MESA_SHADER_FRAGMENT));
-      }
-   }
-   fprintf(fp, "  URB setup attributes:\n");
-   for (uint32_t i = 0; i < prog_data->urb_setup_attribs_count; i++) {
-      fprintf(fp, "  [%02d]: %i (%s)\n",
-              i, prog_data->urb_setup_attribs[i],
-              gl_varying_slot_name_for_stage((gl_varying_slot)i,
-                                             MESA_SHADER_FRAGMENT));
-   }
+   int first_slot = INT32_MAX, last_slot = -1;
+
+   /* Ignore PrimitiveID in mesh pipelines, this value is coming from the
+    * per-primitive block.
+    */
+   uint64_t inputs_read = wm_prog_data->inputs;
+   if (mesh)
+      inputs_read &= ~VARYING_BIT_PRIMITIVE_ID;
+
+   for (int _i = 0; _i < prev_stage_vue_map->num_slots; _i++) {
+      uint32_t i = prev_stage_vue_map->num_slots - 1 - _i;
+      int varying = prev_stage_vue_map->slot_to_varying[i];
+      if (varying < 0)
+         continue;
+
+      if (varying == BRW_VARYING_SLOT_PAD ||
+          (inputs_read & BITFIELD64_BIT(varying)) == 0)
+         continue;
+
+      last_slot = i;
+      break;
+   }
+
+   for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
+      int varying = prev_stage_vue_map->slot_to_varying[i];
+      if (varying != BRW_VARYING_SLOT_PAD && varying > 0 &&
+          (inputs_read & BITFIELD64_BIT(varying)) != 0) {
+         first_slot = i;
+         break;
+      }
+   }
+
+   assert((first_slot == INT32_MAX && last_slot == -1) ||
+          (first_slot >= 0 && last_slot >= 0 && last_slot >= first_slot));
+
+   uint32_t num_varyings = wm_prog_data->num_varying_inputs;
+
+   /* When using INTEL_VUE_LAYOUT_SEPARATE_MESH, the location of the
+    * PrimitiveID is unknown at compile time, here we compute the offset
+    * inside the attribute registers which will be read with MOV_INDIRECT in
+    * the shader.
+    */
+   *out_primitive_id_offset = 0;
+   if (prev_stage_vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) {
+      if (mesh) {
+         /* When using Mesh, the PrimitiveID is in the per-primitive block. */
+         if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0)
+            num_varyings--;
+         *out_primitive_id_offset = INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH;
+      } else if (inputs_read & VARYING_BIT_PRIMITIVE_ID) {
+         int primitive_id_slot;
+         if (prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] < 0) {
+            /* If the previous stage doesn't write PrimitiveID, we can have
+             * the HW generate a value (except if GS is enabled but in that
+             * case that's undefined).
+             *
+             * If the FS shader already has a slot of the PrimitiveID value,
+             * use that.
+             */
+            if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0) {
+               if (first_slot == INT32_MAX)
+                  first_slot = 0;
+               primitive_id_slot =
+                  first_slot + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+            } else {
+               primitive_id_slot = ++last_slot;
+            }
+         } else {
+            primitive_id_slot =
+               prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID];
+         }
+         last_slot = MAX2(primitive_id_slot, last_slot);
+
+         *out_primitive_id_offset = 4 * (primitive_id_slot - first_slot);
+      }
+   }
+
+   /* Compute the read parameters for SBE (those have to be 32B aligned) */
+   if (last_slot == -1) {
+      *out_read_offset = 0;
+      *out_read_length = DIV_ROUND_UP(num_varyings, 2);
+      *out_num_varyings = num_varyings;
+   } else {
+      first_slot = ROUND_DOWN_TO(first_slot, 2);
+      *out_read_offset = first_slot / 2;
+      *out_read_length = DIV_ROUND_UP(last_slot - first_slot + 1, 2);
+      *out_num_varyings = num_varyings;
+   }
+}
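Aside from the editor, not part of the diff: a worked example of the read-interval math just above, under an assumed VUE map.

/* Assume the first varying the FS reads sits in slot 5 and the last in
 * slot 9 of the previous stage's VUE map (hypothetical numbers):
 *    first_slot  = ROUND_DOWN_TO(5, 2)         -> 4
 *    read_offset = 4 / 2                       -> 2  (in 2-slot, 32B units)
 *    read_length = DIV_ROUND_UP(9 - 4 + 1, 2)  -> 3
 * i.e. SBE reads 3 pairs of slots starting at the second pair.
 */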
 
-extern "C" int
-brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
-                                       const struct intel_vue_map *prev_stage_vue_map)
+extern "C" void
+brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read,
+                                       uint32_t num_varyings,
+                                       const struct brw_mue_map *mue_map,
+                                       uint32_t *out_read_offset,
+                                       uint32_t *out_read_length)
 {
    /* The header slots are irrelevant for the URB varying slots. They are
     * delivered somewhere else in the thread payload.
@@ -1927,16 +1973,19 @@ brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
     * - LAYER    : R1.1, Render Target Array Index
     * - VIEWPORT : R1.1, Viewport Index
     * - PSIZ     : not available in fragment shaders
+    * - FACE     : R1.1, Front/Back Facing
     */
-   inputs_read &= ~BRW_VUE_HEADER_VARYING_MASK;
+   inputs_read &= ~(BRW_VUE_HEADER_VARYING_MASK | VARYING_BIT_FACE);
 
-   for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
-      int varying = prev_stage_vue_map->slot_to_varying[i];
-      if (varying != BRW_VARYING_SLOT_PAD && varying > 0 &&
-          (inputs_read & BITFIELD64_BIT(varying)) != 0) {
-         return ROUND_DOWN_TO(i, 2);
-      }
-   }
+   uint32_t first_read = UINT32_MAX;
+   u_foreach_bit64(varying, inputs_read) {
+      if (mue_map->per_primitive_offsets[varying] < 0)
+         continue;
+
+      first_read = mue_map->per_primitive_offsets[varying];
+      break;
+   }
 
-   return 0;
+   *out_read_offset = DIV_ROUND_UP(first_read, 32);
+   *out_read_length = DIV_ROUND_UP(num_varyings, 2);
 }

@@ -516,711 +516,166 @@ struct attr_desc {
    unsigned slots;
 };
 
-struct attr_type_info {
-   /* order of attributes, negative values are holes */
-   std::list<struct attr_desc> *order;
-
-   /* attributes after which there's hole of size equal to array index */
-   std::list<int> holes[5];
-};
-
-static void
-brw_mue_assign_position(const struct attr_desc *attr,
-                        struct brw_mue_map *map,
-                        unsigned start_dw)
-{
-   bool is_array = glsl_type_is_array(attr->type);
-   int location = attr->location;
-   unsigned remaining = attr->dwords;
-
-   for (unsigned slot = 0; slot < attr->slots; ++slot) {
-      map->start_dw[location + slot] = start_dw;
-
-      unsigned sz;
-
-      if (is_array) {
-         assert(attr->dwords % attr->slots == 0);
-         sz = attr->dwords / attr->slots;
-      } else {
-         sz = MIN2(remaining, 4);
-      }
-
-      map->len_dw[location + slot] = sz;
-      start_dw += sz;
-      remaining -= sz;
-   }
-}
-
-static unsigned
-brw_sum_size(const std::list<struct attr_desc> &orders)
-{
-   unsigned sz = 0;
-   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
-      sz += (*it).dwords;
-   return sz;
-}
-
-/* Finds order of outputs which require minimum size, without splitting
- * of URB read/write messages (which operate on vec4-aligned memory).
- */
-static void
-brw_compute_mue_layout(const struct brw_compiler *compiler,
-                       std::list<struct attr_desc> *orders,
-                       uint64_t outputs_written,
-                       struct nir_shader *nir,
-                       bool *pack_prim_data_into_header,
-                       bool *pack_vert_data_into_header)
-{
-   const struct shader_info *info = &nir->info;
-
-   struct attr_type_info data[3];
-
-   if ((compiler->mesh.mue_header_packing & 1) == 0)
-      *pack_prim_data_into_header = false;
-   if ((compiler->mesh.mue_header_packing & 2) == 0)
-      *pack_vert_data_into_header = false;
-
-   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
-      data[i].order = &orders[i];
-
-   /* If packing into header is enabled, add a hole of size 4 and add
-    * a virtual location to keep the algorithm happy (it expects holes
-    * to be preceded by some location). We'll remove those virtual
-    * locations at the end.
-    */
-   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
-   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
-
-   struct attr_desc d;
-   d.location = virtual_header_location;
-   d.type = NULL;
-   d.dwords = 0;
-   d.slots = 0;
-
-   struct attr_desc h;
-   h.location = -1;
-   h.type = NULL;
-   h.dwords = 4;
-   h.slots = 0;
-
-   if (*pack_prim_data_into_header) {
-      orders[PRIM].push_back(d);
-      orders[PRIM].push_back(h);
-      data[PRIM].holes[4].push_back(virtual_header_location);
-   }
-
-   if (*pack_vert_data_into_header) {
-      orders[VERT].push_back(d);
-      orders[VERT].push_back(h);
-      data[VERT].holes[4].push_back(virtual_header_location);
-   }
-
-   u_foreach_bit64(location, outputs_written) {
-      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
-         continue;
-
-      /* At this point there are both complete and split variables as
-       * outputs. We need the complete variable to compute the required
-       * size.
-       */
-      nir_variable *var =
-         brw_nir_find_complete_variable_with_location(nir,
-                                                      nir_var_shader_out,
-                                                      location);
-
-      d.location = location;
-      d.type = brw_nir_get_var_type(nir, var);
-      d.dwords = glsl_count_dword_slots(d.type, false);
-      d.slots = glsl_count_attribute_slots(d.type, false);
-
-      struct attr_type_info *type_data;
-
-      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
-         type_data = &data[PRIM];
-      else if (var->data.interpolation == INTERP_MODE_FLAT)
-         type_data = &data[VERT_FLAT];
-      else
-         type_data = &data[VERT];
-
-      std::list<struct attr_desc> *order = type_data->order;
-      std::list<int> *holes = type_data->holes;
-
-      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
-
-      /* special case to use hole of size 4 */
-      if (d.dwords == 4 && !holes[4].empty()) {
-         holes[4].pop_back();
-
-         assert(order->front().location == virtual_header_location);
-         order->pop_front();
-
-         assert(order->front().location == -1);
-         assert(order->front().dwords == 4);
-         order->front() = d;
-
-         continue;
-      }
-
-      int mod = d.dwords % 4;
-      if (mod == 0) {
-         order->push_back(d);
-         continue;
-      }
-
-      h.location = -1;
-      h.type = NULL;
-      h.dwords = 4 - mod;
-      h.slots = 0;
-
-      if (!compiler->mesh.mue_compaction) {
-         order->push_back(d);
-         order->push_back(h);
-         continue;
-      }
-
-      if (d.dwords > 4) {
-         order->push_back(d);
-         order->push_back(h);
-         holes[h.dwords].push_back(location);
-         continue;
-      }
-
-      assert(d.dwords < 4);
-
-      unsigned found = 0;
-      /* try to find the smallest hole big enough to hold this attribute */
-      for (unsigned sz = d.dwords; sz <= 4; sz++){
-         if (!holes[sz].empty()) {
-            found = sz;
-            break;
-         }
-      }
-
-      /* append at the end if not found */
-      if (found == 0) {
-         order->push_back(d);
-         order->push_back(h);
-         holes[h.dwords].push_back(location);
-
-         continue;
-      }
-
-      assert(found <= 4);
-      assert(!holes[found].empty());
-      int after_loc = holes[found].back();
-      holes[found].pop_back();
-
-      bool inserted_back = false;
-
-      for (auto it = order->begin(); it != order->end(); ++it) {
-         if ((*it).location != after_loc)
-            continue;
-
-         ++it;
-         /* must be a hole */
-         assert((*it).location < 0);
-         /* and it must be big enough */
-         assert(d.dwords <= (*it).dwords);
-
-         if (d.dwords == (*it).dwords) {
-            /* exact size, just replace */
-            *it = d;
-         } else {
-            /* inexact size, shrink hole */
-            (*it).dwords -= d.dwords;
-            /* and insert new attribute before it */
-            order->insert(it, d);
-
-            /* Insert shrunk hole in a spot so that the order of attributes
-             * is preserved.
-             */
-            std::list<int> &hole_list = holes[(*it).dwords];
-            std::list<int>::iterator insert_before = hole_list.end();
-
-            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
-               if ((*it2) >= (int)location) {
-                  insert_before = it2;
-                  break;
-               }
-            }
-
-            hole_list.insert(insert_before, location);
-         }
-
-         inserted_back = true;
-         break;
-      }
-
-      assert(inserted_back);
-   }
-
-   if (*pack_prim_data_into_header) {
-      if (orders[PRIM].front().location == virtual_header_location)
-         orders[PRIM].pop_front();
-
-      if (!data[PRIM].holes[4].empty()) {
-         *pack_prim_data_into_header = false;
-
-         assert(orders[PRIM].front().location == -1);
-         assert(orders[PRIM].front().dwords == 4);
-         orders[PRIM].pop_front();
-      }
-
-      if (*pack_prim_data_into_header) {
-         unsigned sz = brw_sum_size(orders[PRIM]);
-
-         if (sz % 8 == 0 || sz % 8 > 4)
-            *pack_prim_data_into_header = false;
-      }
-   }
-
-   if (*pack_vert_data_into_header) {
-      if (orders[VERT].front().location == virtual_header_location)
-         orders[VERT].pop_front();
-
-      if (!data[VERT].holes[4].empty()) {
-         *pack_vert_data_into_header = false;
-
-         assert(orders[VERT].front().location == -1);
-         assert(orders[VERT].front().dwords == 4);
-         orders[VERT].pop_front();
-      }
-
-      if (*pack_vert_data_into_header) {
-         unsigned sz = brw_sum_size(orders[VERT]) +
-                       brw_sum_size(orders[VERT_FLAT]);
-
-         if (sz % 8 == 0 || sz % 8 > 4)
-            *pack_vert_data_into_header = false;
-      }
-   }
-
-
-   if (INTEL_DEBUG(DEBUG_MESH)) {
-      fprintf(stderr, "MUE attribute order:\n");
-      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
-         if (!orders[i].empty())
-            fprintf(stderr, "%d: ", i);
-         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
-            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
-         }
-         if (!orders[i].empty())
-            fprintf(stderr, "\n");
-      }
-   }
-}
-
 /* Mesh URB Entry consists of an initial section
  *
  * - Primitive Count
  * - Primitive Indices (from 0 to Max-1)
  * - Padding to 32B if needed
  *
  * optionally followed by a section for per-primitive data,
  * in which each primitive (from 0 to Max-1) gets
  *
  * - Primitive Header (e.g. ViewportIndex)
  * - Primitive Custom Attributes
  *
  * then followed by a section for per-vertex data
  *
  * - Vertex Header (e.g. Position)
  * - Vertex Custom Attributes
  *
  * Each per-element section has a pitch and a starting offset. All the
  * individual attributes offsets in start_dw are considering the first entry
  * of the section (i.e. where the Position for first vertex, or ViewportIndex
  * for first primitive). Attributes for other elements are calculated using
  * the pitch.
  */
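Aside, not part of the diff: the addressing scheme the comment above describes, reduced to one hypothetical helper (field names follow struct brw_mue_map; the helper itself is illustrative):

/* start_dw[slot] holds the dword offset of `slot` for the *first* entry of
 * its section; entry n of the same section sits one pitch further per step. */
static inline int
mue_per_vertex_attr_dw(const struct brw_mue_map *map, unsigned n, int slot)
{
   return map->start_dw[slot] + n * map->per_vertex_pitch_dw;
}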
|
||||
static void
|
||||
brw_compute_mue_map(const struct brw_compiler *compiler,
|
||||
struct nir_shader *nir, struct brw_mue_map *map,
|
||||
enum brw_mesh_index_format index_format, bool compact_mue)
|
||||
nir_shader *nir, struct brw_mue_map *map,
|
||||
enum brw_mesh_index_format index_format,
|
||||
enum intel_vue_layout vue_layout)
|
||||
{
|
||||
memset(map, 0, sizeof(*map));
|
||||
|
||||
memset(&map->start_dw[0], -1, sizeof(map->start_dw));
|
||||
memset(&map->len_dw[0], 0, sizeof(map->len_dw));
|
||||
|
||||
unsigned vertices_per_primitive =
|
||||
mesa_vertices_per_prim(nir->info.mesh.primitive_type);
|
||||
|
||||
map->max_primitives = nir->info.mesh.max_primitives_out;
|
||||
map->max_vertices = nir->info.mesh.max_vertices_out;
|
||||
|
||||
uint64_t outputs_written = nir->info.outputs_written;
|
||||
/* NumPrimitives */
|
||||
map->size += 4;
|
||||
|
||||
/* PrimX indices */
|
||||
const unsigned vertices_per_primitive =
|
||||
mesa_vertices_per_prim(nir->info.mesh.primitive_type);
|
||||
|
||||
/* One dword for primitives count then K extra dwords for each primitive. */
|
||||
switch (index_format) {
|
||||
case BRW_INDEX_FORMAT_U32:
|
||||
map->per_primitive_indices_dw = vertices_per_primitive;
|
||||
map->per_primitive_indices_stride = 4 * vertices_per_primitive;
|
||||
break;
|
||||
case BRW_INDEX_FORMAT_U888X:
|
||||
map->per_primitive_indices_dw = 1;
|
||||
map->per_primitive_indices_stride = 4;
|
||||
break;
|
||||
default:
|
||||
unreachable("invalid index format");
|
||||
}
|
||||
|
||||
map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
|
||||
map->max_primitives + 1, 8);
|
||||
map->size += map->per_primitive_indices_stride * map->max_primitives;
|
||||
|
||||
/* Assign initial section. */
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
|
||||
}
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
|
||||
map->per_primitive_indices_dw * map->max_primitives;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
|
||||
}
|
||||
/* Per primitive blocks */
|
||||
map->size = align(map->size, 32);
|
||||
map->per_primitive_offset = map->size;
|
||||
|
||||
const uint64_t count_indices_bits =
|
||||
VARYING_BIT_PRIMITIVE_COUNT |
|
||||
VARYING_BIT_PRIMITIVE_INDICES;
|
||||
const uint64_t per_primitive_header_bits =
|
||||
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_LAYER) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
|
||||
VARYING_BIT_PRIMITIVE_SHADING_RATE |
|
||||
VARYING_BIT_LAYER |
|
||||
VARYING_BIT_VIEWPORT |
|
||||
VARYING_BIT_CULL_PRIMITIVE;
|
||||
|
||||
const uint64_t per_vertex_header_bits =
|
||||
BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_POS) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
|
||||
/* Do we need a header? */
|
||||
map->has_per_primitive_header =
|
||||
(nir->info.outputs_written &
|
||||
nir->info.per_primitive_outputs &
|
||||
per_primitive_header_bits) != 0;
|
||||
|
||||
std::list<struct attr_desc> orders[3];
|
||||
uint64_t regular_outputs = outputs_written &
|
||||
~(per_primitive_header_bits | per_vertex_header_bits);
|
||||
uint32_t first_per_prim_offset;
|
||||
brw_compute_per_primitive_map(map->per_primitive_offsets,
|
||||
&map->per_primitive_stride,
|
||||
&first_per_prim_offset,
|
||||
map->has_per_primitive_header ? 32 : 0,
|
||||
nir, nir_var_shader_out,
|
||||
nir->info.outputs_written &
|
||||
nir->info.per_primitive_outputs,
|
||||
vue_layout != INTEL_VUE_LAYOUT_FIXED);
|
||||
|
||||
/* packing into prim header is possible only if prim header is present */
|
||||
map->user_data_in_primitive_header = compact_mue &&
|
||||
(outputs_written & per_primitive_header_bits) != 0;
|
||||
map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
|
||||
map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES] = 4;
|
||||
if (map->has_per_primitive_header) {
|
||||
/* Setup all the fields in the header */
|
||||
map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
|
||||
map->per_primitive_offsets[VARYING_SLOT_LAYER] = 4;
|
||||
map->per_primitive_offsets[VARYING_SLOT_VIEWPORT] = 8;
|
||||
map->per_primitive_offsets[VARYING_SLOT_CULL_PRIMITIVE] = 12;
|
||||
}
|
||||
|
||||
/* Packing into vert header is always possible, but we allow it only
|
||||
* if full vec4 is available (so point size is not used) and there's
|
||||
* nothing between it and normal vertex data (so no clip distances).
|
||||
map->per_primitive_stride = align(map->per_primitive_stride, 32);
|
||||
|
||||
map->size += map->per_primitive_stride * map->max_primitives;
|
||||
assert(map->size % 32 == 0);
|
||||
|
||||
assert((nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID) == 0 ||
|
||||
(nir->info.outputs_written & nir->info.per_primitive_outputs) != 0);
|
||||
|
||||
/* Per vertex blocks:
|
||||
*
|
||||
* For some selected bit that can appear either as per-primitive or
|
||||
* per-vertex inputs to the fragment shader, we need to add them to the
|
||||
* per-vertex block as well so that the layouts match. Even though they're
|
||||
* not written.
|
||||
*/
|
||||
map->user_data_in_vertex_header = compact_mue &&
|
||||
(outputs_written & per_vertex_header_bits) ==
|
||||
BITFIELD64_BIT(VARYING_SLOT_POS);
|
||||
const uint64_t per_primitive_outputs =
|
||||
nir->info.outputs_written & nir->info.per_primitive_outputs;
|
||||
const uint64_t per_vertex_outputs =
|
||||
(nir->info.outputs_written &
|
||||
~(per_primitive_outputs | count_indices_bits | per_primitive_header_bits));
|
||||
|
||||
if (outputs_written & per_primitive_header_bits) {
|
||||
bool zero_layer_viewport = false;
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
|
||||
map->per_primitive_start_dw + 0;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
|
||||
/* Wa_16020916187: force 0 writes to layer and viewport slots */
|
||||
zero_layer_viewport =
|
||||
intel_needs_workaround(compiler->devinfo, 16020916187);
|
||||
}
|
||||
|
||||
if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) ||
|
||||
zero_layer_viewport) {
|
||||
map->start_dw[VARYING_SLOT_LAYER] =
|
||||
map->per_primitive_start_dw + 1; /* RTAIndex */
|
||||
map->len_dw[VARYING_SLOT_LAYER] = 1;
|
||||
}
|
||||
|
||||
if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) ||
|
||||
zero_layer_viewport) {
|
||||
map->start_dw[VARYING_SLOT_VIEWPORT] =
|
||||
map->per_primitive_start_dw + 2;
|
||||
map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
|
||||
map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
|
||||
map->per_primitive_start_dw + 3;
|
||||
map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
|
||||
}
|
||||
|
||||
map->per_primitive_header_size_dw = 8;
|
||||
outputs_written &= ~per_primitive_header_bits;
|
||||
} else {
|
||||
map->per_primitive_header_size_dw = 0;
|
||||
}
|
||||
|
||||
map->per_primitive_data_size_dw = 0;
|
||||
|
||||
/* For fast linked libraries, we can't pack the MUE, as the fragment shader
|
||||
* will be compiled without access to the MUE map and won't be able to find
|
||||
* out where everything is.
|
||||
* Instead, keep doing things as we did before the packing, just laying out
|
||||
* everything in varying order, which is how the FS will expect them.
|
||||
*/
|
||||
if (compact_mue) {
|
||||
brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
|
||||
&map->user_data_in_primitive_header,
|
||||
&map->user_data_in_vertex_header);
|
||||
|
||||
unsigned start_dw = map->per_primitive_start_dw;
|
||||
if (map->user_data_in_primitive_header)
|
||||
start_dw += 4; /* first 4 dwords are used */
|
||||
else
|
||||
start_dw += map->per_primitive_header_size_dw;
|
||||
unsigned header_used_dw = 0;
|
||||
|
||||
for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
|
||||
int location = (*it).location;
|
||||
if (location < 0) {
|
||||
start_dw += (*it).dwords;
|
||||
if (map->user_data_in_primitive_header && header_used_dw < 4)
|
||||
header_used_dw += (*it).dwords;
|
||||
else
|
||||
map->per_primitive_data_size_dw += (*it).dwords;
|
||||
assert(header_used_dw <= 4);
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
|
||||
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
|
||||
location >= VARYING_SLOT_VAR0);
|
||||
|
||||
brw_mue_assign_position(&*it, map, start_dw);
|
||||
|
||||
start_dw += (*it).dwords;
|
||||
if (map->user_data_in_primitive_header && header_used_dw < 4)
|
||||
header_used_dw += (*it).dwords;
|
||||
else
|
||||
map->per_primitive_data_size_dw += (*it).dwords;
|
||||
assert(header_used_dw <= 4);
|
||||
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
|
||||
}
|
||||
} else {
|
||||
unsigned start_dw = map->per_primitive_start_dw +
|
||||
map->per_primitive_header_size_dw;
|
||||
|
||||
uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
|
||||
while (per_prim_outputs) {
|
||||
uint64_t location = ffsll(per_prim_outputs) - 1;
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
|
||||
location >= VARYING_SLOT_VAR0);
|
||||
|
||||
nir_variable *var =
|
||||
brw_nir_find_complete_variable_with_location(nir,
|
||||
nir_var_shader_out,
|
||||
location);
|
||||
struct attr_desc d;
|
||||
d.location = location;
|
||||
d.type = brw_nir_get_var_type(nir, var);
|
||||
d.dwords = glsl_count_dword_slots(d.type, false);
|
||||
d.slots = glsl_count_attribute_slots(d.type, false);
|
||||
|
||||
brw_mue_assign_position(&d, map, start_dw);
|
||||
|
||||
map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
|
||||
start_dw += ALIGN(d.dwords, 4);
|
||||
|
||||
per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
|
||||
}
|
||||
}
|
||||
|
||||
map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
|
||||
map->per_primitive_data_size_dw, 8);
|
||||
|
||||
map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
|
||||
map->per_primitive_pitch_dw *
|
||||
map->max_primitives, 8);
|
||||
|
||||
/* TODO(mesh): Multiview. */
|
||||
unsigned fixed_header_size = 8;
|
||||
map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
|
||||
nir->info.clip_distance_array_size +
|
||||
nir->info.cull_distance_array_size, 8);
|
||||
|
||||
if (outputs_written & per_vertex_header_bits) {
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
|
||||
map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
|
||||
map->len_dw[VARYING_SLOT_PSIZ] = 1;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
|
||||
map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
|
||||
map->len_dw[VARYING_SLOT_POS] = 4;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
|
||||
map->start_dw[VARYING_SLOT_CLIP_DIST0] =
|
||||
map->per_vertex_start_dw + fixed_header_size + 0;
|
||||
map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
|
||||
map->start_dw[VARYING_SLOT_CLIP_DIST1] =
|
||||
map->per_vertex_start_dw + fixed_header_size + 4;
|
||||
map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
|
||||
}
|
||||
|
||||
outputs_written &= ~per_vertex_header_bits;
|
||||
}
|
||||
|
||||
/* cull distances should be lowered earlier */
|
||||
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
|
||||
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
|
||||
|
||||
map->per_vertex_data_size_dw = 0;
|
||||
|
||||
/* For fast linked libraries, we can't pack the MUE, as the fragment shader
|
||||
* will be compiled without access to the MUE map and won't be able to find
|
||||
* out where everything is.
|
||||
* Instead, keep doing things as we did before the packing, just laying out
|
||||
* everything in varying order, which is how the FS will expect them.
|
||||
*/
|
||||
if (compact_mue) {
|
||||
unsigned start_dw = map->per_vertex_start_dw;
|
||||
if (!map->user_data_in_vertex_header)
|
||||
start_dw += map->per_vertex_header_size_dw;
|
||||
|
||||
unsigned header_used_dw = 0;
|
||||
for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
|
||||
for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
|
||||
int location = (*it).location;
|
||||
if (location < 0) {
|
||||
start_dw += (*it).dwords;
|
||||
if (map->user_data_in_vertex_header && header_used_dw < 4) {
|
||||
header_used_dw += (*it).dwords;
|
||||
assert(header_used_dw <= 4);
|
||||
if (header_used_dw == 4)
|
||||
start_dw += 4; /* jump over gl_position */
|
||||
} else {
|
||||
map->per_vertex_data_size_dw += (*it).dwords;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
|
||||
assert(location >= VARYING_SLOT_VAR0);
|
||||
|
||||
brw_mue_assign_position(&*it, map, start_dw);
|
||||
|
||||
start_dw += (*it).dwords;
|
||||
if (map->user_data_in_vertex_header && header_used_dw < 4) {
|
||||
header_used_dw += (*it).dwords;
|
||||
assert(header_used_dw <= 4);
|
||||
if (header_used_dw == 4)
|
||||
start_dw += 4; /* jump over gl_position */
|
||||
} else {
|
||||
map->per_vertex_data_size_dw += (*it).dwords;
|
||||
}
|
||||
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unsigned start_dw = map->per_vertex_start_dw +
|
||||
map->per_vertex_header_size_dw;
|
||||
|
||||
uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
|
||||
while (per_vertex_outputs) {
|
||||
uint64_t location = ffsll(per_vertex_outputs) - 1;
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
assert(location >= VARYING_SLOT_VAR0);
|
||||
|
||||
nir_variable *var =
|
||||
brw_nir_find_complete_variable_with_location(nir,
|
||||
nir_var_shader_out,
|
||||
location);
|
||||
struct attr_desc d;
|
||||
d.location = location;
|
||||
d.type = brw_nir_get_var_type(nir, var);
|
||||
d.dwords = glsl_count_dword_slots(d.type, false);
|
||||
d.slots = glsl_count_attribute_slots(d.type, false);
|
||||
|
||||
brw_mue_assign_position(&d, map, start_dw);
|
||||
|
||||
map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
|
||||
start_dw += ALIGN(d.dwords, 4);
|
||||
|
||||
per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
|
||||
}
|
||||
}
|
||||
|
||||
map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
|
||||
map->per_vertex_data_size_dw, 8);
|
||||
|
||||
map->size_dw =
|
||||
map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
|
||||
|
||||
assert(map->size_dw % 8 == 0);
|
||||
map->per_vertex_offset = map->size;
|
||||
brw_compute_vue_map(compiler->devinfo,
|
||||
&map->vue_map, per_vertex_outputs,
|
||||
vue_layout, 1 /* pos_slots, TODO: multiview */);
|
||||
map->per_vertex_stride = align(map->vue_map.num_slots * 16, 32);
|
||||
map->size += map->per_vertex_stride * map->max_vertices;
|
||||
assert(map->size % 32 == 0);
|
||||
}

static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
   fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
           map->size_dw, map->max_primitives, map->max_vertices);
   fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
   fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
   fprintf(fp, "MUE map (%d bytes, %d primitives, %d vertices):\n",
           map->size, map->max_primitives, map->max_vertices);
   fprintf(fp, " indices_stride: %d\n", map->per_primitive_indices_stride);
   fprintf(fp, " primitive_header: %d\n", map->has_per_primitive_header);
   fprintf(fp, " primitive_offset: %d\n", map->per_primitive_offset);
   fprintf(fp, " primitive_stride: %d\n", map->per_primitive_stride);
   fprintf(fp, " vertex_offset: %d\n", map->per_vertex_offset);
   fprintf(fp, " vertex_stride: %d\n", map->per_vertex_stride);

   fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_primitive_start_dw,
           map->per_primitive_header_size_dw,
           map->per_primitive_data_size_dw,
           map->per_primitive_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
   fprintf(fp, " primitive offsets:\n");
   fprintf(fp, " %s: %d\n",
           gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_COUNT,
                                          MESA_SHADER_MESH),
           map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT]);
   fprintf(fp, " %s: %d\n",
           gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_INDICES,
                                          MESA_SHADER_MESH),
           map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES]);
   for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->per_primitive_offsets[i] < 0 ||
          i == VARYING_SLOT_PRIMITIVE_COUNT ||
          i == VARYING_SLOT_PRIMITIVE_INDICES)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_primitive_start_dw ||
          offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
         continue;

      const char *name =
         gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                        MESA_SHADER_MESH);

      fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
              name, i);
      fprintf(fp, " %s: %d (relative %d)\n",
              gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                             MESA_SHADER_MESH),
              map->per_primitive_offset + map->per_primitive_offsets[i],
              map->per_primitive_offsets[i]);
   }
   brw_print_vue_map(fp, &map->vue_map, MESA_SHADER_MESH);
}

   fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_vertex_start_dw,
           map->per_vertex_header_size_dw,
           map->per_vertex_data_size_dw,
           map->per_vertex_pitch_dw);
static bool
remap_io_to_dwords(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_per_vertex_output &&
       intrin->intrinsic != nir_intrinsic_load_per_primitive_output &&
       intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
       intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
      return false;

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;
   nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
   if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES ||
       io_sem.location == VARYING_SLOT_PRIMITIVE_COUNT)
      return false;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];
   nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) * 4);
   nir_intrinsic_set_range(intrin, nir_intrinsic_range(intrin) * 4);

      if (offset < map->per_vertex_start_dw ||
          offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
         continue;
   b->cursor = nir_before_instr(&intrin->instr);

      nir_variable *var =
         nir_find_variable_with_location(nir, nir_var_shader_out, i);
      bool flat = var->data.interpolation == INTERP_MODE_FLAT;
   nir_src *offset = nir_get_io_offset_src(intrin);
   assert(offset != NULL);

      const char *name =
         gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                        MESA_SHADER_MESH);
   nir_src_rewrite(offset, nir_ishl_imm(b, offset->ssa, 2));

      fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
              name, i, flat ? " (flat)" : "");
   }

   fprintf(fp, "\n");
   return true;
}
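
remap_io_to_dwords scales every slot-based quantity by four, since a vec4 slot is four dwords. A standalone sketch of the same arithmetic (the helper name is illustrative, not part of the commit):

   /* base and range are multiplied by 4 via nir_intrinsic_set_base/range,
    * and the dynamic offset source is shifted left by 2 (i.e. times 4).
    */
   static inline uint32_t slots_to_dwords(uint32_t slots) { return slots * 4; }

   /* Example: base slot 3 with a range of 2 slots becomes base 12, range 8. */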

static void

@ -1229,12 +684,73 @@ brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
   nir_foreach_shader_out_variable(var, nir) {
      int location = var->data.location;
      assert(location >= 0);
      assert(map->start_dw[location] != -1);
      var->data.driver_location = map->start_dw[location];

      switch (location) {
      case VARYING_SLOT_PRIMITIVE_COUNT:
      case VARYING_SLOT_PRIMITIVE_INDICES:
         /* Primitive count & indices are not part of the per-primitive block,
          * they have their own spot just before. We saved their offset in the
          * per-primitive array, we just don't need to add the block offset.
          */
         var->data.driver_location =
            map->per_primitive_offsets[location] / 4;
         break;

      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
         var->data.driver_location = map->per_primitive_offset / 16;
         var->data.location_frac = 0;
         break;

      case VARYING_SLOT_LAYER:
         var->data.driver_location = map->per_primitive_offset / 16;
         var->data.location_frac = 1;
         break;

      case VARYING_SLOT_VIEWPORT:
         var->data.driver_location = map->per_primitive_offset / 16;
         var->data.location_frac = 2;
         break;

      case VARYING_SLOT_CULL_PRIMITIVE:
         var->data.driver_location = map->per_primitive_offset / 16;
         var->data.location_frac = 3;
         break;

      case VARYING_SLOT_PSIZ:
         var->data.driver_location = map->per_vertex_offset / 16;
         var->data.location_frac = 3;
         break;

      default:
         if (nir->info.per_primitive_outputs & BITFIELD64_BIT(location)) {
            assert(map->per_primitive_offsets[location] != -1);
            var->data.driver_location =
               (map->per_primitive_offset +
                map->per_primitive_offsets[location]) / 16;
         } else {
            /* Each per vertex location has its own slot/vec4 (16B) of data,
             * use map->vue_map.varying_to_slot[] to get the 16B offset and
             * add the per-vertex block offset.
             */
            assert(map->vue_map.varying_to_slot[location] != -1);
            var->data.driver_location =
               map->per_vertex_offset / 16 +
               map->vue_map.varying_to_slot[location];
         }
         break;
      }
   }

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
            type_size_vec4,
            nir_lower_io_lower_64bit_to_32);

   /* Everything works with slots in terms of IO, but our backend deals with
    * dwords. Apply remapping.
    */
   NIR_PASS(_, nir, nir_shader_intrinsics_pass,
            remap_io_to_dwords, nir_metadata_control_flow, NULL);
}
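
For a generic per-vertex varying, driver_location ends up counting vec4s from the start of the MUE. A worked example with illustrative values, not taken from the commit:

   unsigned per_vertex_offset = 192;  /* bytes, start of per-vertex block */
   unsigned slot = 5;                 /* vue_map.varying_to_slot[location] */
   unsigned driver_location = per_vertex_offset / 16 + slot;  /* == 17 */
   unsigned byte_offset = driver_location * 16;               /* == 272 */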

static void

@ -1242,8 +758,6 @@ brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
{
   assert(map->per_primitive_header_size_dw > 0);

   nir_builder b;
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   b = nir_builder_at(nir_before_impl(entrypoint));

@ -1277,7 +791,7 @@ brw_nir_initialize_mue(nir_shader *nir,
                             prim_in_inv * workgroup_size);

      nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                     .base = (int)map->per_primitive_start_dw,
                                     .base = (int)map->per_primitive_offset / 4,
                                     .write_mask = WRITEMASK_XYZW,
                                     .component = 0,
                                     .src_type = nir_type_uint32);

@ -1297,7 +811,7 @@ brw_nir_initialize_mue(nir_shader *nir,
                                prims_per_inv * workgroup_size);

         nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                        .base = (int)map->per_primitive_start_dw,
                                        .base = (int)map->per_primitive_offset / 4,
                                        .write_mask = WRITEMASK_XYZW,
                                        .component = 0,
                                        .src_type = nir_type_uint32);

@ -1348,8 +862,7 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_per_vertex_output:
      brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);

      brw_nir_adjust_offset(b, intrin, map->per_vertex_stride / 4);
      return true;

   case nir_intrinsic_load_per_primitive_output:

@ -1357,12 +870,11 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
      uint32_t pitch;
      if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
         pitch = map->per_primitive_indices_dw;
         pitch = map->per_primitive_indices_stride;
      else
         pitch = map->per_primitive_pitch_dw;

      brw_nir_adjust_offset(b, intrin, pitch);
         pitch = map->per_primitive_stride;

      brw_nir_adjust_offset(b, intrin, pitch / 4);
      return true;
   }

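brw_nir_adjust_offset folds the arrayed (per-vertex or per-primitive) index into the intrinsic's offset using the given pitch; under the new layout the pitch is the byte stride divided by 4. A sketch with illustrative values:

   uint32_t per_vertex_stride = 96;            /* bytes, 32B aligned */
   uint32_t pitch_dw = per_vertex_stride / 4;  /* 24 dwords per vertex */
   uint32_t vertex = 3, base_dw = 8;
   uint32_t dw_offset = vertex * pitch_dw + base_dw;  /* == 80 */
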
@ -1571,11 +1083,12 @@ brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader
    * those to 0. The workaround also requires disabling autostrip.
    */
   if (intel_needs_workaround(compiler->devinfo, 16020916187) &&
       (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written))
       (VARYING_BIT_PRIMITIVE_SHADING_RATE & outputs_written))
      return false;

   if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
       map->start_dw[VARYING_SLOT_LAYER] < 0)
   /* Values not written */
   if ((outputs_written & (VARYING_BIT_VIEWPORT |
                           VARYING_BIT_LAYER)) == 0)
      return true;

   nir_def *vp = NULL;

@ -1694,7 +1207,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    * When Primitive Header is enabled, we may not generate writes to all
    * fields, so let's initialize everything.
    */
   if (prog_data->map.per_primitive_header_size_dw > 0)
   if (prog_data->map.has_per_primitive_header)
      NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);

   brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

@ -194,11 +194,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
      compiler->nir_options[i] = nir_options;
   }

   compiler->mesh.mue_header_packing =
      (unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
   compiler->mesh.mue_compaction =
      debug_get_bool_option("INTEL_MESH_COMPACTION", true);

   return compiler;
}

@ -218,8 +213,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
   bits++;
   insert_u64_bit(&config, compiler->lower_dpas);
   bits++;
   insert_u64_bit(&config, compiler->mesh.mue_compaction);
   bits++;

   enum intel_debug_flag debug_bits[] = {
      DEBUG_NO_DUAL_OBJECT_GS,

@ -244,9 +237,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
   mask = 3;
   bits += util_bitcount64(mask);

   u_foreach_bit64(bit, mask)
      insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);

   assert(bits <= util_bitcount64(UINT64_MAX));

   return config;

@ -122,11 +122,6 @@ struct brw_compiler {
   int spilling_rate;

   struct nir_shader *clc_shader;

   struct {
      unsigned mue_header_packing;
      bool mue_compaction;
   } mesh;
};

#define brw_shader_debug_log(compiler, data, fmt, ... ) do { \

@ -371,7 +366,7 @@ struct brw_wm_prog_key {
   /* Whether or not we are running on a multisampled framebuffer */
   enum intel_sometimes multisample_fbo:2;

   /* Whether the preceding shader stage is mesh */
   /* Whether the shader is dispatched with a preceding mesh shader */
   enum intel_sometimes mesh_input:2;

   bool coherent_fb_fetch:1;

@ -388,7 +383,7 @@ brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key)
   return key->alpha_to_coverage == INTEL_SOMETIMES ||
          key->persample_interp == INTEL_SOMETIMES ||
          key->multisample_fbo == INTEL_SOMETIMES ||
          key->mesh_input == INTEL_SOMETIMES;
          key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH;
}

struct brw_cs_prog_key {

@ -784,6 +779,13 @@ struct brw_wm_prog_data {
    */
   uint64_t inputs;

   /**
    * The FS per-primitive inputs (some bits can be in both inputs &
    * per_primitive_inputs if the shader is compiled without being linked to
    * the previous stage)
    */
   uint64_t per_primitive_inputs;

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.

@ -1049,8 +1051,6 @@ typedef enum
void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
                       gl_shader_stage stage);

void brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */

@ -1069,6 +1069,16 @@ brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}
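
brw_vue_slot_to_offset() is slot * 16, since every VUE slot is one vec4. Illustrative usage, values not from the commit:

   int slot = 4;                      /* vue_map->varying_to_slot[varying] */
   unsigned byte_offset = slot * 16;  /* == 64, offset within the VUE */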

void
brw_compute_per_primitive_map(int *out_per_primitive_map,
                              uint32_t *out_per_primitive_stride,
                              uint32_t *out_first_offset,
                              uint32_t base_offset,
                              nir_shader *nir,
                              uint32_t variables_mode,
                              uint64_t slots_valid,
                              bool separate_shader);

void brw_compute_vue_map(const struct intel_device_info *devinfo,
                         struct intel_vue_map *vue_map,
                         uint64_t slots_valid,

@ -1190,25 +1200,40 @@ struct brw_tue_map {
};

struct brw_mue_map {
   int32_t start_dw[VARYING_SLOT_MAX];
   uint32_t len_dw[VARYING_SLOT_MAX];
   uint32_t per_primitive_indices_dw;

   uint32_t size_dw;
   /* Total size in bytes of the MUE (32B aligned) */
   uint32_t size;

   uint32_t max_primitives;
   uint32_t per_primitive_start_dw;
   uint32_t per_primitive_header_size_dw;
   uint32_t per_primitive_data_size_dw;
   uint32_t per_primitive_pitch_dw;
   bool user_data_in_primitive_header;

   uint32_t max_vertices;
   uint32_t per_vertex_start_dw;
   uint32_t per_vertex_header_size_dw;
   uint32_t per_vertex_data_size_dw;
   uint32_t per_vertex_pitch_dw;
   bool user_data_in_vertex_header;

   /* Stride in bytes between sets of primitive indices */
   uint32_t per_primitive_indices_stride;

   /* Per primitive offset from the start of the MUE (32B aligned) */
   uint32_t per_primitive_offset;

   /* Per primitive stride in bytes (32B aligned) */
   uint32_t per_primitive_stride;

   /* Whether the per primitive block includes a header */
   bool has_per_primitive_header;

   /* Per vertex offset in bytes from the start of the MUE (32B aligned) */
   uint32_t per_vertex_offset;

   /* Size of the per vertex header (32B aligned) */
   uint32_t per_vertex_header_size;

   /* Per vertex stride in bytes (32B aligned) */
   uint32_t per_vertex_stride;

   /* VUE map for the per vertex attributes */
   struct intel_vue_map vue_map;

   /* Offset in bytes of each per primitive varying relative to
    * per_primitive_offset (-1 if unused)
    */
   int per_primitive_offsets[VARYING_SLOT_MAX];
};
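
A sketch of how the byte-based fields compose, following the layout code earlier in this commit (per_vertex_offset is assigned from the running size, then the per-vertex block is appended at the end):

   uint32_t per_vertex_end = map->per_vertex_offset +
                             map->max_vertices * map->per_vertex_stride;
   assert(per_vertex_end == map->size);  /* per-vertex block comes last */
   assert(map->size % 32 == 0);          /* 32B alignment, asserted above */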

struct brw_task_prog_data {

@ -1628,7 +1653,28 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
 */
int
brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
                                       const struct intel_vue_map *prev_stage_vue_map);
                                       const struct intel_vue_map *prev_stage_vue_map,
                                       bool mesh);

void
brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map,
                                    bool mesh,
                                    const struct brw_wm_prog_data *wm_prog_data,
                                    uint32_t *out_first_slot,
                                    uint32_t *num_slots,
                                    uint32_t *out_num_varyings,
                                    uint32_t *out_primitive_id_offset);

/**
 * Computes the URB offset at which SBE should read the per primitive data
 * written by the mesh shader.
 */
void
brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read,
                                       uint32_t num_varyings,
                                       const struct brw_mue_map *mue_map,
                                       uint32_t *out_read_offset,
                                       uint32_t *out_read_length);

/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */
#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8

@ -4153,7 +4153,8 @@ brw_interp_reg(const brw_builder &bld, unsigned location,
{
   brw_shader &s = *bld.shader;
   assert(s.stage == MESA_SHADER_FRAGMENT);
   assert(BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs);
   assert((BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs) ||
          location == VARYING_SLOT_PRIMITIVE_ID);

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

@ -4161,10 +4162,6 @@ brw_interp_reg(const brw_builder &bld, unsigned location,
   unsigned nr = prog_data->urb_setup[location];
   channel += prog_data->urb_setup_channel[location];

   /* Adjust so we start counting from the first per_vertex input. */
   assert(nr >= prog_data->num_per_primitive_inputs);
   nr -= prog_data->num_per_primitive_inputs;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
   const unsigned regnr = per_vertex_start + (nr * 4) + channel;

@ -4196,14 +4193,15 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp)

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);

   comp += prog_data->urb_setup_channel[location];
   comp += (s.fs.per_primitive_offsets[location] % 16) / 4;

   assert(prog_data->urb_setup[location] >= 0);

   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
   const unsigned regnr = s.fs.per_primitive_offsets[location] / 16 + comp / 4;

   assert(s.fs.per_primitive_offsets[location] >= 0);
   assert(regnr < prog_data->num_per_primitive_inputs);

   brw_reg loc_reg = brw_attr_reg(regnr, BRW_TYPE_UD);

   if (s.max_polygons > 1) {
      /* In multipolygon dispatch each primitive constant is a
       * dispatch_width-wide SIMD vector (see comment in

@ -4211,11 +4209,10 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp)
       * component() to select the specified parameter.
       */
      const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
      bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
                          s.dispatch_width, comp % 4));
      bld.MOV(tmp, offset(loc_reg, s.dispatch_width, comp % 4));
      return retype(tmp, BRW_TYPE_F);
   } else {
      return component(brw_attr_reg(regnr, BRW_TYPE_F), comp % 4);
      return component(loc_reg, comp % 4);
   }
}
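
The per-primitive GRF index is now derived purely from the byte offset recorded at compile time. A standalone mirror of that arithmetic with illustrative values:

   int byte_offset = 40;            /* s.fs.per_primitive_offsets[location] */
   unsigned comp = 1;               /* requested component */
   comp += (byte_offset % 16) / 4;  /* component within the vec4: 1 + 2 = 3 */
   unsigned regnr = byte_offset / 16 + comp / 4;  /* GRF index: 2 + 0 = 2 */
   unsigned subcomp = comp % 4;                   /* component in that GRF: 3 */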

@ -4419,7 +4416,7 @@ brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
      break;
   }

   if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
   if (instr->intrinsic == nir_intrinsic_load_per_primitive_input) {
      assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
      for (unsigned int i = 0; i < num_components; i++) {
         bld.MOV(offset(dest, bld, i),

@ -658,11 +658,75 @@ lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin,
   return true;
}

static bool
lower_indirect_primitive_id(nir_builder *b,
                            nir_intrinsic_instr *intrin,
                            void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_per_primitive_input)
      return false;

   if (nir_intrinsic_io_semantics(intrin).location != VARYING_SLOT_PRIMITIVE_ID)
      return false;

   nir_def *indirect_primitive_id = data;
   nir_def_replace(&intrin->def, indirect_primitive_id);

   return true;
}

void
brw_nir_lower_fs_inputs(nir_shader *nir,
                        const struct intel_device_info *devinfo,
                        const struct brw_wm_prog_key *key)
{
   nir_def *indirect_primitive_id = NULL;
   if (key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH &&
       (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID)) {
      nir_builder _b = nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))), *b = &_b;
      nir_def *index = nir_ushr_imm(b,
                                    nir_load_fs_msaa_intel(b),
                                    INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);
      nir_def *max_poly = nir_load_max_polygon_intel(b);
      /* Build the per-vertex offset into the attribute section of the thread
       * payload. There is always one GRF of padding in front.
       *
       * The computation is fairly complicated due to the layout of the
       * payload. You can find a description of the layout in
       * brw_compile_fs.cpp brw_assign_urb_setup().
       *
       * Gfx < 20 packs 2 slots per GRF (hence the %/ 2 in the formula)
       * Gfx >= 20 packs 5 slots per GRF (hence the %/ 5 in the formula)
       *
       * Then an additional offset needs to be added to handle how multiple
       * polygon data is interleaved.
       */
      nir_def *per_vertex_offset = nir_iadd_imm(
         b,
         devinfo->ver >= 20 ?
         nir_iadd(b,
                  nir_imul(b, nir_udiv_imm(b, index, 5), nir_imul_imm(b, max_poly, 64)),
                  nir_imul_imm(b, nir_umod_imm(b, index, 5), 12)) :
         nir_iadd_imm(
            b,
            nir_iadd(
               b,
               nir_imul(b, nir_udiv_imm(b, index, 2), nir_imul_imm(b, max_poly, 32)),
               nir_imul_imm(b, nir_umod_imm(b, index, 2), 16)),
            12),
         devinfo->grf_size);
      /* When the attribute index is INTEL_MSAA_FLAG_PRIMITIVE_ID_MESH_INDEX,
       * it means the value is coming from the per-primitive block. We always
       * lay out PrimitiveID at offset 0 in the per-primitive block.
       */
      nir_def *attribute_offset = nir_bcsel(
         b,
         nir_ieq_imm(b, index, INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH),
         nir_imm_int(b, 0), per_vertex_offset);
      indirect_primitive_id =
         nir_read_attribute_payload_intel(b, attribute_offset);
   }
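
The NIR built above computes a byte offset into the attribute payload. The same arithmetic in plain C for the pre-Gfx20 case (the function name is illustrative; grf_size is 32 on those platforms):

   static uint32_t
   payload_offset_pre_gfx20(uint32_t index, uint32_t max_poly, uint32_t grf_size)
   {
      return (index / 2) * (max_poly * 32) +  /* GRF holding the slot pair */
             (index % 2) * 16 +               /* which 16B half of the GRF */
             12 +                             /* constant bias from the pass */
             grf_size;                        /* one GRF of padding in front */
   }
   /* payload_offset_pre_gfx20(3, 1, 32) == 32 + 16 + 12 + 32 == 92 */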

   nir_foreach_shader_in_variable(var, nir) {
      var->data.driver_location = var->data.location;

@ -679,6 +743,14 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
         var->data.interpolation = flat ? INTERP_MODE_FLAT
                                        : INTERP_MODE_SMOOTH;
      }

      /* Always pull the PrimitiveID from the per-primitive block if mesh
       * can be involved.
       */
      if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
          key->mesh_input != INTEL_NEVER) {
         var->data.per_primitive = true;
         nir->info.per_primitive_inputs |= VARYING_BIT_PRIMITIVE_ID;
      }
   }

   NIR_PASS(_, nir, nir_lower_io,

@ -704,6 +776,13 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
               NULL);
   }

   if (indirect_primitive_id != NULL) {
      NIR_PASS(_, nir, nir_shader_intrinsics_pass,
               lower_indirect_primitive_id,
               nir_metadata_control_flow,
               indirect_primitive_id);
   }

   /* This pass needs actual constants */
   NIR_PASS(_, nir, nir_opt_constant_folding);

@ -477,6 +477,9 @@ brw_shader::init()

   this->gs.control_data_bits_per_vertex = 0;
   this->gs.control_data_header_size_bits = 0;

   memset(&this->fs.per_primitive_offsets, -1,
          sizeof(this->fs.per_primitive_offsets));
}

brw_shader::~brw_shader()

@ -548,6 +551,16 @@ brw_shader::import_uniforms(brw_shader *v)
   this->uniforms = v->uniforms;
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
brw_shader::import_per_primitive_offsets(const int *per_primitive_offsets)
{
   memcpy(this->fs.per_primitive_offsets, per_primitive_offsets,
          sizeof(this->fs.per_primitive_offsets));
}

enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr)

@ -1309,4 +1322,3 @@ brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE)
{
   return brw_vgrf(brw_allocate_vgrf_number(s, units_of_REGSIZE), BRW_TYPE_UD);
}

@ -83,6 +83,7 @@ public:
   ~brw_shader();

   void import_uniforms(brw_shader *v);
   void import_per_primitive_offsets(const int *per_primitive_offsets);

   void assign_curb_setup();
   void convert_attr_sources_to_hw_regs(brw_inst *inst);

@ -202,6 +203,11 @@ public:
      unsigned control_data_header_size_bits;
   } gs;

   struct {
      /* Offset of per-primitive locations in bytes */
      int per_primitive_offsets[VARYING_SLOT_MAX];
   } fs;

   unsigned grf_used;
   bool spilled_any_registers;
   bool needs_register_pressure;

@ -42,6 +42,105 @@

#include "brw_compiler.h"
#include "dev/intel_debug.h"
#include "brw_nir.h"

static unsigned
get_var_slots(gl_shader_stage stage, const nir_variable *var)
{
   const struct glsl_type *type = var->type;

   if (nir_is_arrayed_io(var, stage)) {
      assert(glsl_type_is_array(type));
      type = glsl_get_array_element(type);
   }

   /* EXT_mesh_shader: PRIMITIVE_INDICES is a flat array, not a proper arrayed
    * output, as opposed to D3D-style mesh shaders where it's addressed by the
    * primitive index. Prevent assigning several slots to primitive indices,
    * to avoid some issues.
    */
   if (stage == MESA_SHADER_MESH &&
       var->data.location == VARYING_SLOT_PRIMITIVE_INDICES &&
       !nir_is_arrayed_io(var, stage))
      return 1;

   return glsl_count_vec4_slots(type, false, var->data.bindless);
}

void
brw_compute_per_primitive_map(int *out_per_primitive_map,
                              uint32_t *out_per_primitive_stride,
                              uint32_t *out_first_offset,
                              uint32_t base_offset,
                              nir_shader *nir,
                              uint32_t variables_mode,
                              uint64_t slots_valid,
                              bool separate_shader)
{
   memset(out_per_primitive_map, -1, sizeof(*out_per_primitive_map) * VARYING_SLOT_MAX);
   *out_per_primitive_stride = base_offset;
   *out_first_offset = UINT32_MAX;

   const uint64_t count_indices_bits =
      VARYING_BIT_PRIMITIVE_COUNT |
      VARYING_BIT_PRIMITIVE_INDICES;
   const uint64_t per_primitive_header_bits =
      VARYING_BIT_PRIMITIVE_SHADING_RATE |
      VARYING_BIT_LAYER |
      VARYING_BIT_VIEWPORT |
      VARYING_BIT_CULL_PRIMITIVE;
   const uint64_t per_primitive_outputs_written =
      slots_valid & ~(count_indices_bits | per_primitive_header_bits);

   *out_first_offset = base_offset;

   /* We put each variable in its own 16B slot. Technically we could do a lot
    * better by allocating the space needed for the variable since the data is
    * constant and not interpolated for the fragment shader. Unfortunately the
    * backend treats those values similarly to vertex attributes and making
    * that change would require a pretty large change in the backend. Let's do
    * this later.
    */

   /* Lay out builtins first */
   const uint64_t builtins =
      per_primitive_outputs_written & BITFIELD64_MASK(VARYING_SLOT_VAR0);
   u_foreach_bit64(location, builtins) {
      assert(out_per_primitive_map[location] == -1);

      out_per_primitive_map[location] = *out_per_primitive_stride;
      *out_per_primitive_stride += 16;
   }

   uint32_t generics_offset = *out_per_primitive_stride;

   /* Lay out generics */
   const uint64_t generics =
      per_primitive_outputs_written & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
   const int first_generic_output = ffsl(generics) - 1;
   u_foreach_bit64(location, generics) {
      assert(out_per_primitive_map[location] == -1);
      if (!separate_shader) {
         /* Just append the location at the back */
         out_per_primitive_map[location] = *out_per_primitive_stride;
      } else {
         assert(location >= VARYING_SLOT_VAR0);
         /* Each location has its fixed spot */
         out_per_primitive_map[location] = generics_offset +
            16 * (location - first_generic_output);
      }

      *out_per_primitive_stride =
         MAX2(out_per_primitive_map[location] + 16,
              *out_per_primitive_stride);

      *out_first_offset = MIN2(out_per_primitive_map[location],
                               *out_first_offset);
   }

   *out_first_offset = *out_first_offset == UINT32_MAX ? 0 :
      ROUND_DOWN_TO(*out_first_offset, 32);
}
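
A worked example of the map above, with illustrative inputs (separate_shader = true, base_offset = 0, mesh writing PRIMITIVE_ID, VAR1 and VAR3):

   /* PRIMITIVE_ID -> 0   (builtins first, 16B each)
    * VAR1         -> 16  (generics_offset + 16 * (VAR1 - VAR1))
    * VAR3         -> 48  (generics_offset + 16 * (VAR3 - VAR1))
    *
    * stride = 64; first_offset stays at base_offset (0) after rounding.
    */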

static inline void
assign_vue_slot(struct intel_vue_map *vue_map, int varying, int slot)

@ -63,6 +162,9 @@ brw_compute_vue_map(const struct intel_device_info *devinfo,
                    enum intel_vue_layout layout,
                    uint32_t pos_slots)
{
   vue_map->slots_valid = slots_valid;
   vue_map->layout = layout;

   if (layout != INTEL_VUE_LAYOUT_FIXED) {
      /* In SSO mode, we don't know whether the adjacent stage will
       * read/write gl_ClipDistance, which has a fixed slot location.

@ -76,15 +178,17 @@ brw_compute_vue_map(const struct intel_device_info *devinfo,
      slots_valid |= VARYING_BIT_CLIP_DIST1;
   }

   vue_map->slots_valid = slots_valid;
   vue_map->layout = layout;

   /* gl_Layer, gl_ViewportIndex & gl_PrimitiveShadingRateEXT don't get their
    * own varying slots -- they are stored in the first VUE slot
    * (VARYING_SLOT_PSIZ).
    */
   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE);

   /* gl_FrontFace is provided somewhere else in the FS thread payload, it's
    * never in the VUE.
    */
   slots_valid &= ~VARYING_BIT_FACE;

   /* Make sure that the values we store in vue_map->varying_to_slot and
    * vue_map->slot_to_varying won't overflow the signed chars that are used
    * to store them. Note that since vue_map->slot_to_varying sometimes holds

@ -153,35 +257,38 @@ brw_compute_vue_map(const struct intel_device_info *devinfo,
    * can assign them however we like. For normal programs, we simply assign
    * them contiguously.
    *
    * For separate shader pipelines, we first assign built-in varyings
    * contiguous slots. This works because ARB_separate_shader_objects
    * requires that all shaders have matching built-in varying interface
    * blocks. Next, we assign generic varyings based on their location
    * (either explicit or linker assigned). This guarantees a fixed layout.
    *
    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
    * since it's encoded as the clip distances by emit_clip_distances().
    * However, it may be output by transform feedback, and we'd rather not
    * recompute state when TF changes, so we just always include it.
    */
   uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
   while (builtins != 0) {
      const int varying = ffsll(builtins) - 1;
      if (vue_map->varying_to_slot[varying] == -1) {
   if (layout != INTEL_VUE_LAYOUT_SEPARATE_MESH) {
      const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
      u_foreach_bit64(varying, builtins) {
         /* Already assigned above? */
         if (vue_map->varying_to_slot[varying] != -1)
            continue;
         assign_vue_slot(vue_map, varying, slot++);
      }
      builtins &= ~BITFIELD64_BIT(varying);
   }

   const int first_generic_slot = slot;
   uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
   while (generics != 0) {
      const int varying = ffsll(generics) - 1;
   const uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
   u_foreach_bit64(varying, generics) {
      if (layout != INTEL_VUE_LAYOUT_FIXED) {
         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
      }
      assign_vue_slot(vue_map, varying, slot++);
      generics &= ~BITFIELD64_BIT(varying);
   }

   if (layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) {
      const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
      u_foreach_bit64(varying, builtins) {
         /* Already assigned above? */
         if (vue_map->varying_to_slot[varying] != -1)
            continue;
         assign_vue_slot(vue_map, varying, slot++);
      }
   }

   vue_map->num_slots = slot;

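The net effect for INTEL_VUE_LAYOUT_SEPARATE_MESH is that generics keep their fixed location-based slots and leftover built-ins (notably gl_PrimitiveID) are appended at the end. An illustrative outcome, assuming the usual 2-slot VUE header:

   /* Stage writes POS, PSIZ, PRIMITIVE_ID, VAR0 and VAR2:
    *
    *   slot 0..1  VUE header (PSIZ, POS)
    *   slot 2     VAR0          (first_generic_slot + 0)
    *   slot 4     VAR2          (first_generic_slot + 2, VAR1's slot unused)
    *   slot 5     PRIMITIVE_ID  (appended after the last generic)
    */
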
@ -30,6 +30,9 @@ intel_sometimes_invert(enum intel_sometimes x)
   return (enum intel_sometimes)((int)INTEL_ALWAYS - (int)x);
}

#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH   (32)

enum intel_msaa_flags {
   /** Must be set whenever any dynamic MSAA is used
    *

@ -63,6 +66,15 @@ enum intel_msaa_flags {
    * in the render target messages.
    */
   INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18),

   /** Index of the PrimitiveID attribute relative to the first read
    * attribute.
    *
    * This is not a flag but a value that covers bits 20:31. Value 32 means
    * the PrimitiveID is coming from the PerPrimitive block, written by the
    * Mesh shader.
    */
   INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX = (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET),
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags)

@ -133,6 +145,10 @@ enum intel_vue_layout {
    * Mesh support.
    */
   INTEL_VUE_LAYOUT_SEPARATE,
   /**
    * Layout is separate and works with Mesh shaders.
    */
   INTEL_VUE_LAYOUT_SEPARATE_MESH,
};

/**

@ -157,11 +173,70 @@ struct intel_vue_map {
   /**
    * The layout of the VUE
    *
    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
    * without the linker having a chance to dead code eliminate unused varyings.
    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and
    * matched without the linker having a chance to dead code eliminate unused
    * varyings.
    *
    * This means that we have to use a fixed slot layout, based on the output's
    * location field, rather than assigning slots in a compact contiguous block.
    *
    * When using Mesh, another constraint arises, which is the HW limit for
    * loading per-primitive & per-vertex data, limited to 32 varyings in total.
    * This requires us to be quite inventive with the way we lay things out.
    * Take a fragment shader loading the following data:
    *
    *    float gl_ClipDistance[];
    *    uint  gl_PrimitiveID;
    *    vec4  someAppValue[29];
    *
    * According to the Vulkan spec, someAppValue will occupy 29 slots,
    * gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is
    * coming from a VS/DS/GS shader, we can load all of this through a single
    * block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum
    * programmable value) and the layout with
    * BRW_VUE_MAP_LAYOUT_FIXED/BRW_VUE_MAP_LAYOUT_SEPARATE will be this:
    *
    *    -----------------------
    *    | gl_ClipDistance 0-3 |
    *    |---------------------|
    *    | gl_ClipDistance 4-7 |
    *    |---------------------|
    *    | gl_PrimitiveID      |
    *    |---------------------|
    *    | someAppValue[]      |
    *    |---------------------|
    *
    * This works nicely as everything is coming from the same location in the
    * URB.
    *
    * When mesh shaders are involved, gl_PrimitiveID is located in a different
    * place in the URB (the per-primitive block) and requires programming
    * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some
    * additional data. The HW has a limit such that
    * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength +
    * 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the
    * layout above, we would not be able to accommodate that HW limit.
    *
    * The solution to this is to lay the built-in varyings out
    * (gl_ClipDistance omitted since it's part of the VUE header and cannot
    * live any other place) at the end of the VUE like this:
    *
    *    -----------------------
    *    | gl_ClipDistance 0-3 |
    *    |---------------------|
    *    | gl_ClipDistance 4-7 |
    *    |---------------------|
    *    | someAppValue[]      |
    *    |---------------------|
    *    | gl_PrimitiveID      |
    *    |---------------------|
    *
    * This layout adds another challenge because with separate shader
    * compilations, we cannot tell in the consumer shader how many outputs the
    * producer has, so we don't know where the gl_PrimitiveID lives. The
    * solution to this other problem is to read the built-in with a
    * MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a
    * push constant.
    */
   enum intel_vue_layout layout;

@ -361,6 +436,7 @@ struct intel_fs_params {
   uint32_t rasterization_samples;
   bool coarse_pixel;
   bool alpha_to_coverage;
   uint32_t primitive_id_index;
};

static inline enum intel_msaa_flags

@ -392,6 +468,9 @@ intel_fs_msaa_flags(struct intel_fs_params params)
   if (params.alpha_to_coverage)
      fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;

   fs_msaa_flags |= (enum intel_msaa_flags)(
      params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);

   return fs_msaa_flags;
}
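
A standalone sketch of the PrimitiveID index packing defined above: bits 20 and up hold the attribute index, and the value 32 means "read from the per-primitive block written by the mesh shader". Helper names are illustrative:

   static inline uint32_t pack_primid_index(uint32_t flags, uint32_t index)
   {
      return flags | (index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);
   }

   static inline uint32_t unpack_primid_index(uint32_t flags)
   {
      /* Matches the nir_ushr_imm() in brw_nir_lower_fs_inputs(). */
      return flags >> INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET;
   }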

@ -1757,7 +1757,10 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
       !device->vk.enabled_extensions.EXT_graphics_pipeline_library) {
      vue_layout = INTEL_VUE_LAYOUT_FIXED;
   } else {
      vue_layout = INTEL_VUE_LAYOUT_SEPARATE;
      vue_layout =
         (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
          device->vk.enabled_extensions.EXT_mesh_shader) ?
         INTEL_VUE_LAYOUT_SEPARATE_MESH : INTEL_VUE_LAYOUT_SEPARATE;
   }

   for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {

@ -4976,6 +4976,9 @@ struct anv_graphics_pipeline {
   uint32_t view_mask;
   uint32_t instance_multiplier;

   /* Attribute index of the PrimitiveID in the delivered attributes */
   uint32_t primitive_id_index;

   bool kill_pixel;
   bool uses_xfb;
   bool sample_shading_enable;

@ -780,6 +780,7 @@ update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state,
      .rasterization_samples = dyn->ms.rasterization_samples,
      .coarse_pixel = !vk_fragment_shading_rate_is_disabled(&dyn->fsr),
      .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
      .primitive_id_index = pipeline->primitive_id_index,
   });

   SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags);

@ -531,7 +531,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
   const struct intel_mesh_urb_allocation alloc =
      intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
                                task_prog_data ? task_prog_data->map.size_dw : 0,
                                mesh_prog_data->map.size_dw);
                                mesh_prog_data->map.size / 4);

   /* Zero out the primitive pipeline URB allocations. */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {

@ -641,14 +641,14 @@ sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
         get_mesh_prog_data(pipeline);
      const struct brw_mue_map *mue = &mesh_prog_data->map;
      return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
             mue->start_dw[VARYING_SLOT_PRIMITIVE_ID] == -1;
             mue->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_ID] == -1;
   }

   const struct intel_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
          fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
          (fs_input_map->slots_valid & VARYING_BIT_PRIMITIVE_ID) == 0;
}

static void

@ -667,30 +667,34 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
      return;
   }

   const struct intel_vue_map *vue_map =
      anv_pipeline_is_mesh(pipeline) ?
      &get_mesh_prog_data(pipeline)->map.vue_map :
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
         int max_source_attr = 0;
         uint32_t vertex_read_offset, vertex_read_length, vertex_varyings;
         brw_compute_sbe_per_vertex_urb_read(
            vue_map, anv_pipeline_is_mesh(pipeline), wm_prog_data,
            &vertex_read_offset, &vertex_read_length, &vertex_varyings,
            &pipeline->primitive_id_index);

         /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
          * calculate_urb_setup() and related functions.
          */
         sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
         sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
         sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
         sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
         sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs &
                                           ((1u << vertex_varyings) - 1);
         sbe.NumberofSFOutputAttributes = vertex_varyings;

         for (unsigned i = 0; i < 32; i++)
            sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;

         /* As far as we can test, some of the fields in 3DSTATE_SBE & all of
          * 3DSTATE_SBE_SWIZ have no effect when the pipeline is using Mesh so
          * don't bother filling those fields.
          */
         if (anv_pipeline_is_primitive(pipeline)) {
            const struct intel_vue_map *fs_input_map =
               &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

            int first_slot =
               brw_compute_first_fs_urb_slot_required(wm_prog_data->inputs,
                                                      fs_input_map);
            assert(first_slot % 2 == 0);
            unsigned urb_entry_read_offset = first_slot / 2;
            int max_source_attr = 0;
            for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
               uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
               int input_index = wm_prog_data->urb_setup[attr];

@ -702,7 +706,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
                  continue;
               }

               const int slot = fs_input_map->varying_to_slot[attr];
               const int slot = vue_map->varying_to_slot[attr];

               if (slot == -1) {
                  /* This attribute does not exist in the VUE--that means that

@ -723,7 +727,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
               /* We have to subtract two slots to account for the URB entry
                * output read offset in the VS and GS stages.
                */
               const int source_attr = slot - 2 * urb_entry_read_offset;
               const int source_attr = slot - 2 * vertex_read_offset;
               assert(source_attr >= 0 && source_attr < 32);
               max_source_attr = MAX2(max_source_attr, source_attr);
               /* The hardware can only do overrides on 16 overrides at a time,

@ -737,83 +741,51 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
               assert(source_attr == input_index);
            }

            sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
            sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
            sbe.VertexURBEntryReadOffset = vertex_read_offset;
            sbe.VertexURBEntryReadLength = vertex_read_length;
            sbe.ForceVertexURBEntryReadOffset = true;
            sbe.ForceVertexURBEntryReadLength = true;

            /* Ask the hardware to supply PrimitiveID if the fragment shader
             * reads it but a previous stage didn't write one.
             */
            if (sbe_primitive_id_override(pipeline)) {
               sbe.PrimitiveIDOverrideAttributeSelect =
                  wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
               sbe.PrimitiveIDOverrideComponentX = true;
               sbe.PrimitiveIDOverrideComponentY = true;
               sbe.PrimitiveIDOverrideComponentZ = true;
               sbe.PrimitiveIDOverrideComponentW = true;
            }
         }
      }
   }

   /* Ask the hardware to supply PrimitiveID if the fragment shader reads
    * it but a previous stage didn't write one.
    */
   if (sbe_primitive_id_override(pipeline)) {
      sbe.PrimitiveIDOverrideAttributeSelect =
         wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
      sbe.PrimitiveIDOverrideComponentX = true;
      sbe.PrimitiveIDOverrideComponentY = true;
      sbe.PrimitiveIDOverrideComponentZ = true;
      sbe.PrimitiveIDOverrideComponentW = true;
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_pipeline_emit(pipeline, final.sbe_mesh,
                        GENX(3DSTATE_SBE_MESH), sbe_mesh) {
         if (!anv_pipeline_is_mesh(pipeline))
            continue;
      if (device->vk.enabled_extensions.EXT_mesh_shader) {
         anv_pipeline_emit(pipeline, final.sbe_mesh,
                           GENX(3DSTATE_SBE_MESH), sbe_mesh) {
            if (!anv_pipeline_is_mesh(pipeline))
               continue;

         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         const struct brw_mue_map *mue = &mesh_prog_data->map;
            const struct brw_mesh_prog_data *mesh_prog_data =
               get_mesh_prog_data(pipeline);

         assert(mue->per_vertex_header_size_dw % 8 == 0);
         sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
         sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
            sbe_mesh.PerVertexURBEntryOutputReadOffset = vertex_read_offset;
            sbe_mesh.PerVertexURBEntryOutputReadLength = vertex_read_length;

         /* Clip distance array is passed in the per-vertex header so that it
          * can be consumed by the HW. If the user wants to read it in the FS,
          * adjust the offset and length to cover it. Conveniently it is at
          * the end of the per-vertex header, right before per-vertex
          * attributes.
          *
          * Note that FS attribute reading must be aware that the clip
          * distances have fixed position.
          */
         if (mue->per_vertex_header_size_dw > 8 &&
             (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
              wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
         }
            uint32_t prim_read_offset, prim_read_length;
            brw_compute_sbe_per_primitive_urb_read(wm_prog_data->per_primitive_inputs,
                                                   wm_prog_data->num_per_primitive_inputs,
                                                   &mesh_prog_data->map,
                                                   &prim_read_offset,
                                                   &prim_read_length);

         if (mue->user_data_in_vertex_header) {
            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
         }

         assert(mue->per_primitive_header_size_dw % 8 == 0);
         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
            mue->per_primitive_header_size_dw / 8;
         sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
            DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);

         /* Just like with clip distances, if Primitive Shading Rate, Viewport
          * Index or Layer is read back in the FS, adjust the offset and
          * length to cover the Primitive Header, where PSR, Viewport Index &
          * Layer are stored.
          */
         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
             mue->user_data_in_primitive_header) {
            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = prim_read_offset;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength = prim_read_length;
         }
      }
   }
#endif
   }
}
}

static void

@ -842,7 +814,7 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
   } else {
      assert(anv_pipeline_is_mesh(pipeline));
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
      point_from_shader = mesh_prog_data->map.vue_map.slots_valid & VARYING_BIT_PSIZ;
   }

   if (point_from_shader) {

@ -893,7 +865,7 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

      clip.ForceZeroRTAIndexEnable =
         mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
         mesh_prog_data->map.per_primitive_offsets[VARYING_SLOT_LAYER] < 0;
   }

   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?

@ -905,12 +877,13 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_pipeline_emit(pipeline, final.clip_mesh,
                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
         if (anv_pipeline_is_mesh(pipeline)) {
            const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
            clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
            clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
            clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
         }
         if (!anv_pipeline_is_mesh(pipeline))
            continue;

         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.has_per_primitive_header;
         clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
         clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
      }
   }
#endif

@ -1903,9 +1876,9 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)

   mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
   mesh.OutputTopology = output_topology;
   mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8;
   mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0;
   mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8;
   mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_stride / 32;
   mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_stride > 0;
   mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_stride / 32;
   mesh.IndexFormat = index_format;

   mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
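
The mesh state now consumes the byte-based strides: the HW pitches are programmed in 32B units while the URB allocator still takes dwords. A sketch of the unit conversions with illustrative values:

   uint32_t per_vertex_stride = 96;                        /* bytes, 32B aligned */
   uint32_t PerVertexDataPitch = per_vertex_stride / 32;   /* 32B units: 3 */
   uint32_t mue_size = 4096;                               /* map.size, bytes */
   uint32_t mue_size_dw = mue_size / 4;                    /* dwords for URB config */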