diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index e134853ec0f..d1245068bae 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -4678,30 +4678,39 @@ iris_create_so_decl_list(const struct pipe_stream_output_info *info, } static inline int -iris_compute_first_urb_slot_required(uint64_t inputs_read, +iris_compute_first_urb_slot_required(struct iris_compiled_shader *fs_shader, const struct intel_vue_map *prev_stage_vue_map) { #if GFX_VER >= 9 - return brw_compute_first_fs_urb_slot_required(inputs_read, prev_stage_vue_map); + uint32_t read_offset, read_length, num_varyings, primid_offset; + brw_compute_sbe_per_vertex_urb_read(prev_stage_vue_map, + false /* mesh*/, + brw_wm_prog_data(fs_shader->brw_prog_data), + &read_offset, &read_length, &num_varyings, + &primid_offset); + return 2 * read_offset; #else - return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map); + const struct iris_fs_data *fs_data = iris_fs_data(fs_shader); + return elk_compute_first_urb_slot_required(fs_data->inputs, prev_stage_vue_map); #endif } static void -iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots, +iris_compute_sbe_urb_read_interval(struct iris_compiled_shader *fs_shader, const struct intel_vue_map *last_vue_map, bool two_sided_color, unsigned *out_offset, unsigned *out_length) { + const struct iris_fs_data *fs_data = iris_fs_data(fs_shader); + /* The compiler computes the first URB slot without considering COL/BFC * swizzling (because it doesn't know whether it's enabled), so we need * to do that here too. This may result in a smaller offset, which * should be safe. */ const unsigned first_slot = - iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map); + iris_compute_first_urb_slot_required(fs_shader, last_vue_map); /* This becomes the URB read offset (counted in pairs of slots). */ assert(first_slot % 2 == 0); @@ -4710,6 +4719,7 @@ iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots, /* We need to adjust the inputs read to account for front/back color * swizzling, as it can make the URB length longer. 
*/ + uint64_t fs_input_slots = fs_data->inputs; for (int c = 0; c <= 1; c++) { if (fs_input_slots & (VARYING_BIT_COL0 << c)) { /* If two sided color is enabled, the fragment shader's gl_Color @@ -4898,7 +4908,7 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice) &iris_vue_data(ice->shaders.last_vue_shader)->vue_map; unsigned urb_read_offset, urb_read_length; - iris_compute_sbe_urb_read_interval(fs_data->inputs, + iris_compute_sbe_urb_read_interval(ice->shaders.prog[MESA_SHADER_FRAGMENT], last_vue_map, cso_rast->light_twoside, &urb_read_offset, &urb_read_length); diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 0ad3c9cfcab..c7faffeffd7 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -630,223 +630,138 @@ static void calculate_urb_setup(const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - const nir_shader *nir, - const struct brw_mue_map *mue_map) + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) { memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel)); int urb_next = 0; /* in vec4s */ + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; const uint64_t inputs_read = - nir->info.inputs_read & ~nir->info.per_primitive_inputs; + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + uint64_t unique_fs_attrs = + inputs_read & BRW_FS_VARYING_INPUT_MASK; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; - /* Figure out where each of the incoming setup attributes lands. */ - if (key->mesh_input != INTEL_NEVER) { - /* Per-Primitive Attributes are laid out by Hardware before the regular - * attributes, so order them like this to make easy later to map setup - * into real HW registers. 
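To make the PrimitiveID handling in calculate_urb_setup() above easier to follow, here is a minimal standalone sketch of how the per-vertex and per-primitive input masks get split based on key->mesh_input. It is illustrative only (it omits the per-primitive header bits that the real code also masks out); INTEL_ALWAYS/INTEL_NEVER and VARYING_BIT_PRIMITIVE_ID are the real identifiers, split_primitive_id() is a made-up helper name:

   /* Illustrative sketch: with mesh_input == INTEL_SOMETIMES the PrimitiveID
    * bit ends up in both masks, which is why brw_wm_prog_data gains a
    * separate per_primitive_inputs field.
    */
   static void
   split_primitive_id(enum intel_sometimes mesh_input,
                      uint64_t inputs_read, uint64_t per_primitive_inputs,
                      uint64_t *out_per_vertex, uint64_t *out_per_primitive)
   {
      const uint64_t per_vert_primitive_id =
         mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
      const uint64_t per_prim_primitive_id =
         mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;

      *out_per_vertex    = inputs_read & (~per_primitive_inputs | per_vert_primitive_id);
      *out_per_primitive = inputs_read & (per_primitive_inputs | per_prim_primitive_id);
   }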
+ if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + + memcpy(per_primitive_offsets, + mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = MIN2(first_read_offset, + (uint32_t)per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t)per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, + key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, + &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex stuff (what used to be legacy pipeline) */ + const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK; + + unique_fs_attrs &= ~vue_header_bits; + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(unique_fs_attrs) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. */ - if (nir->info.per_primitive_inputs) { - uint64_t per_prim_inputs_read = - nir->info.inputs_read & nir->info.per_primitive_inputs; - - /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots - * are always at the beginning, because they come from MUE - * Primitive Header, not Per-Primitive Attributes. - */ - const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | - VARYING_BIT_LAYER | - VARYING_BIT_PRIMITIVE_SHADING_RATE; - - if (mue_map) { - unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; - unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; - - bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; - - if (reads_header || mue_map->user_data_in_primitive_header) { - /* Primitive Shading Rate, Layer and Viewport live in the same - * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport - * is dword 2). 
- */ - if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) - prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; - - if (per_prim_inputs_read & VARYING_BIT_LAYER) - prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; - - if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) - prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; - - per_prim_inputs_read &= ~primitive_header_bits; - } else { - /* If fs doesn't need primitive header, then it won't be made - * available through SBE_MESH, so we have to skip them when - * calculating offset from start of per-prim data. - */ - per_prim_start_dw += mue_map->per_primitive_header_size_dw; - per_prim_size_dw -= mue_map->per_primitive_header_size_dw; - } - - u_foreach_bit64(i, per_prim_inputs_read) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_prim_start_dw); - unsigned pos_dw = unsigned(start) - per_prim_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next = per_prim_size_dw / 4; - } else { - /* With no MUE map, we never read the primitive header, and - * per-primitive attributes won't be packed either, so just lay - * them in varying order. - */ - per_prim_inputs_read &= ~primitive_header_bits; - - for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (per_prim_inputs_read & BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - - /* The actual setup attributes later must be aligned to a full GRF. */ - urb_next = ALIGN(urb_next, 2); - } - - prog_data->num_per_primitive_inputs = urb_next; - } - - const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | - VARYING_BIT_CLIP_DIST1; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; - - if (inputs_read & clip_dist_bits) { - assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); - unique_fs_attrs &= ~clip_dist_bits; - } - - if (mue_map) { - unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; - unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; - - /* Per-Vertex header is available to fragment shader only if there's - * user data there. - */ - if (!mue_map->user_data_in_vertex_header) { - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* In Mesh, CLIP_DIST slots are always at the beginning, because - * they come from MUE Vertex Header, not Per-Vertex Attributes. - */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; - } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { - /* Clip distances are in MUE, but we are not reading them in FS. */ - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* Per-Vertex attributes are laid out ordered. Because we always link - * Mesh and Fragment shaders, the which slots are written and read by - * each of them will match. */ - u_foreach_bit64(i, unique_fs_attrs) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_vertex_start_dw); - unsigned pos_dw = unsigned(start) - per_vertex_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next += per_vertex_size_dw / 4; - } else { - /* If we don't have an MUE map, just lay down the inputs the FS reads - * in varying order, as we do for the legacy pipeline. 
- */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; - } - - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (unique_fs_attrs & BITFIELD64_BIT(i)) - prog_data->urb_setup[i] = urb_next++; + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; } } } else { - assert(!nir->info.per_primitive_inputs); - - const uint64_t vue_header_bits = BRW_VUE_HEADER_VARYING_MASK; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits; - - if (util_bitcount64(unique_fs_attrs) <= 16) { - /* The SF/SBE pipeline stage can do arbitrary rearrangement of the - * first 16 varying inputs, so we can put them wherever we want. - * Just put them in order. - * - * This is useful because it means that (a) inputs not used by the - * fragment shader won't take up valuable register space, and (b) we - * won't have to recompile the fragment shader if it gets paired with - * a different vertex (or geometry) shader. - */ - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & - BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && + (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; } - } else { - /* We have enough input varyings that the SF/SBE pipeline stage can't - * arbitrarily rearrange them to suit our whim; we have to put them - * in an order that matches the output of the previous pipeline stage - * (geometry or vertex shader). - */ - - /* Re-compute the VUE map here in the case that the one coming from - * geometry has more than one position slot (used for Primitive - * Replication). 
- */ - struct intel_vue_map prev_stage_vue_map; - brw_compute_vue_map(devinfo, &prev_stage_vue_map, - key->input_slots_valid, - key->base.vue_layout, 1); - - int first_slot = - brw_compute_first_fs_urb_slot_required(unique_fs_attrs, - &prev_stage_vue_map); - - assert(prev_stage_vue_map.num_slots <= first_slot + 32); - for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; - slot++) { - int varying = prev_stage_vue_map.slot_to_varying[slot]; - if (varying != BRW_VARYING_SLOT_PAD && - (inputs_read & BRW_FS_VARYING_INPUT_MASK & - BITFIELD64_BIT(varying))) { - prog_data->urb_setup[varying] = slot - first_slot; - } - } - urb_next = prev_stage_vue_map.num_slots - first_slot; } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; } - prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->num_varying_inputs = urb_next; prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; brw_compute_urb_setup_index(prog_data); } @@ -951,8 +866,6 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, { prog_data->flat_inputs = 0; - const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; - nir_foreach_shader_in_variable(var, shader) { /* flat shading */ if (var->data.interpolation != INTERP_MODE_FLAT) @@ -963,7 +876,7 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, unsigned slots = glsl_count_attribute_slots(var->type, false); for (unsigned s = 0; s < slots; s++) { - int input_index = prog_data->urb_setup[var->data.location + s] - per_vertex_start; + int input_index = prog_data->urb_setup[var->data.location + s]; if (input_index >= 0) prog_data->flat_inputs |= 1 << input_index; @@ -1005,7 +918,8 @@ brw_nir_populate_wm_prog_data(nir_shader *shader, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - const struct brw_mue_map *mue_map) + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) { prog_data->uses_kill = shader->info.fs.uses_discard; prog_data->uses_omask = !key->ignore_sample_mask_out && @@ -1158,7 +1072,7 @@ brw_nir_populate_wm_prog_data(nir_shader *shader, (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && prog_data->coarse_pixel_dispatch != INTEL_NEVER); - calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, per_primitive_offsets); brw_compute_flat_inputs(prog_data, shader); } @@ -1479,6 +1393,44 @@ run_fs(brw_shader &s, bool allow_spilling, bool do_rep_send) return !s.failed; } +static void +brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data, + int *per_primitive_offsets) +{ + fprintf(fp, "FS URB (inputs=0x%016" PRIx64 ", flat_inputs=0x%08x):\n", + prog_data->inputs, prog_data->flat_inputs); + fprintf(fp, " URB setup:\n"); + for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) { + if (prog_data->urb_setup[i] >= 0) { + fprintf(fp, " [%02d]: %i channel=%u (%s)\n", + i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i], + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_FRAGMENT)); + } + } + fprintf(fp, " URB setup attributes:\n"); + for (uint32_t i = 0; i < 
prog_data->urb_setup_attribs_count; i++) { + fprintf(fp, " [%02d]: %i (%s)\n", + i, prog_data->urb_setup_attribs[i], + gl_varying_slot_name_for_stage( + (gl_varying_slot)prog_data->urb_setup_attribs[i], + MESA_SHADER_FRAGMENT)); + } + if (per_primitive_offsets) { + fprintf(fp, " Per Primitive URB setup:\n"); + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1 || + i == VARYING_SLOT_PRIMITIVE_COUNT || + i == VARYING_SLOT_PRIMITIVE_INDICES) + continue; + fprintf(fp, " [%02d]: %i (%s)\n", + i, per_primitive_offsets[i], + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_FRAGMENT)); + } + } +} + const unsigned * brw_compile_fs(const struct brw_compiler *compiler, struct brw_compile_fs_params *params) @@ -1518,8 +1470,15 @@ brw_compile_fs(const struct brw_compiler *compiler, brw_postprocess_nir(nir, compiler, debug_enabled, key->base.robust_flags); + int per_primitive_offsets[VARYING_SLOT_MAX]; + memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, - params->mue_map); + params->mue_map, + per_primitive_offsets); + + if (unlikely(debug_enabled)) + brw_print_fs_urb_setup(stderr, prog_data, per_primitive_offsets); /* Either an unrestricted or a fixed SIMD16 subgroup size are * allowed -- The latter is needed for fast clear and replicated @@ -1540,6 +1499,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 8, 1, params->base.stats != NULL, debug_enabled); + v8->import_per_primitive_offsets(per_primitive_offsets); if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v8->fail_msg); @@ -1645,6 +1605,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 1, params->base.stats != NULL, debug_enabled); + v32->import_per_primitive_offsets(per_primitive_offsets); if (vbase) v32->import_uniforms(vbase); @@ -1670,6 +1631,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 1, params->base.stats != NULL, debug_enabled); + v16->import_per_primitive_offsets(per_primitive_offsets); if (!run_fs(*v16, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1694,6 +1656,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 1, params->base.stats != NULL, debug_enabled); + v16->import_per_primitive_offsets(per_primitive_offsets); if (v8) v16->import_uniforms(v8.get()); if (!run_fs(*v16, allow_spilling, params->use_rep_send)) { @@ -1728,6 +1691,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 1, params->base.stats != NULL, debug_enabled); + v32->import_per_primitive_offsets(per_primitive_offsets); if (v8) v32->import_uniforms(v8.get()); else if (v16) @@ -1772,6 +1736,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 4, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1792,6 +1757,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 32, 2, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1811,6 +1777,7 @@ 
brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 16, 2, params->base.stats != NULL, debug_enabled); + vmulti->import_per_primitive_offsets(per_primitive_offsets); vmulti->import_uniforms(vbase); if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, @@ -1893,31 +1860,110 @@ brw_compile_fs(const struct brw_compiler *compiler, } extern "C" void -brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data) +brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map, + bool mesh, + const struct brw_wm_prog_data *wm_prog_data, + uint32_t *out_read_offset, + uint32_t *out_read_length, + uint32_t *out_num_varyings, + uint32_t *out_primitive_id_offset) { - fprintf(fp, "FS URB (inputs=0x%016lx, flat_inputs=0x%08x):\n", - prog_data->inputs, prog_data->flat_inputs); - fprintf(fp, " URB setup:\n"); - for (uint32_t i = 0; i < ARRAY_SIZE(prog_data->urb_setup); i++) { - if (prog_data->urb_setup[i] >= 0) { - fprintf(fp, " [%02d]: %i channel=%u (%s)\n", - i, prog_data->urb_setup[i], prog_data->urb_setup_channel[i], - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_FRAGMENT)); + int first_slot = INT32_MAX, last_slot = -1; + + /* Ignore PrimitiveID in mesh pipelines, this value is coming from the + * per-primitive block. + */ + uint64_t inputs_read = wm_prog_data->inputs; + if (mesh) + inputs_read &= ~VARYING_BIT_PRIMITIVE_ID; + + for (int _i = 0; _i < prev_stage_vue_map->num_slots; _i++) { + uint32_t i = prev_stage_vue_map->num_slots - 1 - _i; + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying < 0) + continue; + + if (varying == BRW_VARYING_SLOT_PAD || + (inputs_read & BITFIELD64_BIT(varying)) == 0) + continue; + + last_slot = i; + break; + } + + for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && + (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = i; + break; } } - fprintf(fp, " URB setup attributes:\n"); - for (uint32_t i = 0; i < prog_data->urb_setup_attribs_count; i++) { - fprintf(fp, " [%02d]: %i (%s)\n", - i, prog_data->urb_setup_attribs[i], - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_FRAGMENT)); + + assert((first_slot == INT32_MAX && last_slot == -1) || + (first_slot >= 0 && last_slot >= 0 && last_slot >= first_slot)); + + uint32_t num_varyings = wm_prog_data->num_varying_inputs; + + /* When using INTEL_VUE_LAYOUT_SEPARATE_MESH, the location of the + * PrimitiveID is unknown at compile time, here we compute the offset + * inside the attribute registers which will be read with MOV_INDIRECT in + * the shader. + */ + *out_primitive_id_offset = 0; + if (prev_stage_vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) { + if (mesh) { + /* When using Mesh, the PrimitiveID is in the per-primitive block. */ + if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0) + num_varyings--; + *out_primitive_id_offset = INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH; + } else if (inputs_read & VARYING_BIT_PRIMITIVE_ID) { + int primitive_id_slot; + if (prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] < 0) { + /* If the previous stage doesn't write PrimitiveID, we can have + * the HW generate a value (except if GS is enabled but in that + * case that's undefined). + * + * If the FS shader already has a slot of the PrimitiveID value, + * use that. 
+ */ + if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] >= 0) { + if (first_slot == INT32_MAX) + first_slot = 0; + primitive_id_slot = + first_slot + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; + } else { + primitive_id_slot = ++last_slot; + } + } else { + primitive_id_slot = + prev_stage_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID]; + } + last_slot = MAX2(primitive_id_slot, last_slot); + + *out_primitive_id_offset = 4 * (primitive_id_slot - first_slot); + } + } + + /* Compute the read parameters for SBE (those have to be 32B aligned) */ + if (last_slot == -1) { + *out_read_offset = 0; + *out_read_length = DIV_ROUND_UP(num_varyings, 2); + *out_num_varyings = num_varyings; + } else { + first_slot = ROUND_DOWN_TO(first_slot, 2); + *out_read_offset = first_slot / 2; + *out_read_length = DIV_ROUND_UP(last_slot - first_slot + 1, 2); + *out_num_varyings = num_varyings; } } -extern "C" int -brw_compute_first_fs_urb_slot_required(uint64_t inputs_read, - const struct intel_vue_map *prev_stage_vue_map) +extern "C" void +brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read, + uint32_t num_varyings, + const struct brw_mue_map *mue_map, + uint32_t *out_read_offset, + uint32_t *out_read_length) { /* The header slots are irrelevant for the URB varying slots. They are * delivered somewhere else in the thread payload. @@ -1927,16 +1973,19 @@ brw_compute_first_fs_urb_slot_required(uint64_t inputs_read, * - LAYER : R1.1, Render Target Array Index * - VIEWPORT : R1.1, Viewport Index * - PSIZ : not available in fragment shaders + * - FACE : R1.1, Front/Back Facing */ - inputs_read &= ~BRW_VUE_HEADER_VARYING_MASK; + inputs_read &= ~(BRW_VUE_HEADER_VARYING_MASK | VARYING_BIT_FACE); - for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { - int varying = prev_stage_vue_map->slot_to_varying[i]; - if (varying != BRW_VARYING_SLOT_PAD && varying > 0 && - (inputs_read & BITFIELD64_BIT(varying)) != 0) { - return ROUND_DOWN_TO(i, 2); - } + uint32_t first_read = UINT32_MAX; + u_foreach_bit64(varying, inputs_read) { + if (mue_map->per_primitive_offsets[varying] < 0) + continue; + + first_read = mue_map->per_primitive_offsets[varying]; + break; } - return 0; + *out_read_offset = DIV_ROUND_UP(first_read, 32); + *out_read_length = DIV_ROUND_UP(num_varyings, 2); } diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index 82562e358a6..5939e3a42e0 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -516,711 +516,166 @@ struct attr_desc { unsigned slots; }; -struct attr_type_info { - /* order of attributes, negative values are holes */ - std::list *order; - - /* attributes after which there's hole of size equal to array index */ - std::list holes[5]; -}; - -static void -brw_mue_assign_position(const struct attr_desc *attr, - struct brw_mue_map *map, - unsigned start_dw) -{ - bool is_array = glsl_type_is_array(attr->type); - int location = attr->location; - unsigned remaining = attr->dwords; - - for (unsigned slot = 0; slot < attr->slots; ++slot) { - map->start_dw[location + slot] = start_dw; - - unsigned sz; - - if (is_array) { - assert(attr->dwords % attr->slots == 0); - sz = attr->dwords / attr->slots; - } else { - sz = MIN2(remaining, 4); - } - - map->len_dw[location + slot] = sz; - start_dw += sz; - remaining -= sz; - } -} - -static unsigned -brw_sum_size(const std::list &orders) -{ - unsigned sz = 0; - for (auto it = orders.cbegin(); it != orders.cend(); ++it) - sz += (*it).dwords; - return sz; -} - 
-/* Finds order of outputs which require minimum size, without splitting - * of URB read/write messages (which operate on vec4-aligned memory). - */ -static void -brw_compute_mue_layout(const struct brw_compiler *compiler, - std::list *orders, - uint64_t outputs_written, - struct nir_shader *nir, - bool *pack_prim_data_into_header, - bool *pack_vert_data_into_header) -{ - const struct shader_info *info = &nir->info; - - struct attr_type_info data[3]; - - if ((compiler->mesh.mue_header_packing & 1) == 0) - *pack_prim_data_into_header = false; - if ((compiler->mesh.mue_header_packing & 2) == 0) - *pack_vert_data_into_header = false; - - for (unsigned i = PRIM; i <= VERT_FLAT; ++i) - data[i].order = &orders[i]; - - /* If packing into header is enabled, add a hole of size 4 and add - * a virtual location to keep the algorithm happy (it expects holes - * to be preceded by some location). We'll remove those virtual - * locations at the end. - */ - const gl_varying_slot virtual_header_location = VARYING_SLOT_POS; - assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0); - - struct attr_desc d; - d.location = virtual_header_location; - d.type = NULL; - d.dwords = 0; - d.slots = 0; - - struct attr_desc h; - h.location = -1; - h.type = NULL; - h.dwords = 4; - h.slots = 0; - - if (*pack_prim_data_into_header) { - orders[PRIM].push_back(d); - orders[PRIM].push_back(h); - data[PRIM].holes[4].push_back(virtual_header_location); - } - - if (*pack_vert_data_into_header) { - orders[VERT].push_back(d); - orders[VERT].push_back(h); - data[VERT].holes[4].push_back(virtual_header_location); - } - - u_foreach_bit64(location, outputs_written) { - if ((BITFIELD64_BIT(location) & outputs_written) == 0) - continue; - - /* At this point there are both complete and split variables as - * outputs. We need the complete variable to compute the required - * size. 
- */ - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - struct attr_type_info *type_data; - - if (BITFIELD64_BIT(location) & info->per_primitive_outputs) - type_data = &data[PRIM]; - else if (var->data.interpolation == INTERP_MODE_FLAT) - type_data = &data[VERT_FLAT]; - else - type_data = &data[VERT]; - - std::list *order = type_data->order; - std::list *holes = type_data->holes; - - outputs_written &= ~BITFIELD64_RANGE(location, d.slots); - - /* special case to use hole of size 4 */ - if (d.dwords == 4 && !holes[4].empty()) { - holes[4].pop_back(); - - assert(order->front().location == virtual_header_location); - order->pop_front(); - - assert(order->front().location == -1); - assert(order->front().dwords == 4); - order->front() = d; - - continue; - } - - int mod = d.dwords % 4; - if (mod == 0) { - order->push_back(d); - continue; - } - - h.location = -1; - h.type = NULL; - h.dwords = 4 - mod; - h.slots = 0; - - if (!compiler->mesh.mue_compaction) { - order->push_back(d); - order->push_back(h); - continue; - } - - if (d.dwords > 4) { - order->push_back(d); - order->push_back(h); - holes[h.dwords].push_back(location); - continue; - } - - assert(d.dwords < 4); - - unsigned found = 0; - /* try to find the smallest hole big enough to hold this attribute */ - for (unsigned sz = d.dwords; sz <= 4; sz++){ - if (!holes[sz].empty()) { - found = sz; - break; - } - } - - /* append at the end if not found */ - if (found == 0) { - order->push_back(d); - order->push_back(h); - holes[h.dwords].push_back(location); - - continue; - } - - assert(found <= 4); - assert(!holes[found].empty()); - int after_loc = holes[found].back(); - holes[found].pop_back(); - - bool inserted_back = false; - - for (auto it = order->begin(); it != order->end(); ++it) { - if ((*it).location != after_loc) - continue; - - ++it; - /* must be a hole */ - assert((*it).location < 0); - /* and it must be big enough */ - assert(d.dwords <= (*it).dwords); - - if (d.dwords == (*it).dwords) { - /* exact size, just replace */ - *it = d; - } else { - /* inexact size, shrink hole */ - (*it).dwords -= d.dwords; - /* and insert new attribute before it */ - order->insert(it, d); - - /* Insert shrunk hole in a spot so that the order of attributes - * is preserved. 
- */ - std::list &hole_list = holes[(*it).dwords]; - std::list::iterator insert_before = hole_list.end(); - - for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) { - if ((*it2) >= (int)location) { - insert_before = it2; - break; - } - } - - hole_list.insert(insert_before, location); - } - - inserted_back = true; - break; - } - - assert(inserted_back); - } - - if (*pack_prim_data_into_header) { - if (orders[PRIM].front().location == virtual_header_location) - orders[PRIM].pop_front(); - - if (!data[PRIM].holes[4].empty()) { - *pack_prim_data_into_header = false; - - assert(orders[PRIM].front().location == -1); - assert(orders[PRIM].front().dwords == 4); - orders[PRIM].pop_front(); - } - - if (*pack_prim_data_into_header) { - unsigned sz = brw_sum_size(orders[PRIM]); - - if (sz % 8 == 0 || sz % 8 > 4) - *pack_prim_data_into_header = false; - } - } - - if (*pack_vert_data_into_header) { - if (orders[VERT].front().location == virtual_header_location) - orders[VERT].pop_front(); - - if (!data[VERT].holes[4].empty()) { - *pack_vert_data_into_header = false; - - assert(orders[VERT].front().location == -1); - assert(orders[VERT].front().dwords == 4); - orders[VERT].pop_front(); - } - - if (*pack_vert_data_into_header) { - unsigned sz = brw_sum_size(orders[VERT]) + - brw_sum_size(orders[VERT_FLAT]); - - if (sz % 8 == 0 || sz % 8 > 4) - *pack_vert_data_into_header = false; - } - } - - - if (INTEL_DEBUG(DEBUG_MESH)) { - fprintf(stderr, "MUE attribute order:\n"); - for (unsigned i = PRIM; i <= VERT_FLAT; ++i) { - if (!orders[i].empty()) - fprintf(stderr, "%d: ", i); - for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) { - fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords); - } - if (!orders[i].empty()) - fprintf(stderr, "\n"); - } - } -} - -/* Mesh URB Entry consists of an initial section - * - * - Primitive Count - * - Primitive Indices (from 0 to Max-1) - * - Padding to 32B if needed - * - * optionally followed by a section for per-primitive data, - * in which each primitive (from 0 to Max-1) gets - * - * - Primitive Header (e.g. ViewportIndex) - * - Primitive Custom Attributes - * - * then followed by a section for per-vertex data - * - * - Vertex Header (e.g. Position) - * - Vertex Custom Attributes - * - * Each per-element section has a pitch and a starting offset. All the - * individual attributes offsets in start_dw are considering the first entry - * of the section (i.e. where the Position for first vertex, or ViewportIndex - * for first primitive). Attributes for other elements are calculated using - * the pitch. - */ static void brw_compute_mue_map(const struct brw_compiler *compiler, - struct nir_shader *nir, struct brw_mue_map *map, - enum brw_mesh_index_format index_format, bool compact_mue) + nir_shader *nir, struct brw_mue_map *map, + enum brw_mesh_index_format index_format, + enum intel_vue_layout vue_layout) { memset(map, 0, sizeof(*map)); - memset(&map->start_dw[0], -1, sizeof(map->start_dw)); - memset(&map->len_dw[0], 0, sizeof(map->len_dw)); - - unsigned vertices_per_primitive = - mesa_vertices_per_prim(nir->info.mesh.primitive_type); - map->max_primitives = nir->info.mesh.max_primitives_out; map->max_vertices = nir->info.mesh.max_vertices_out; - uint64_t outputs_written = nir->info.outputs_written; + /* NumPrimitives */ + map->size += 4; + + /* PrimX indices */ + const unsigned vertices_per_primitive = + mesa_vertices_per_prim(nir->info.mesh.primitive_type); - /* One dword for primitives count then K extra dwords for each primitive. 
*/ switch (index_format) { case BRW_INDEX_FORMAT_U32: - map->per_primitive_indices_dw = vertices_per_primitive; + map->per_primitive_indices_stride = 4 * vertices_per_primitive; break; case BRW_INDEX_FORMAT_U888X: - map->per_primitive_indices_dw = 1; + map->per_primitive_indices_stride = 4; break; default: unreachable("invalid index format"); } - map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw * - map->max_primitives + 1, 8); + map->size += map->per_primitive_indices_stride * map->max_primitives; - /* Assign initial section. */ - if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) { - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0; - map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1; - outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT); - } - if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) { - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1; - map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] = - map->per_primitive_indices_dw * map->max_primitives; - outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES); - } + /* Per primitive blocks */ + map->size = align(map->size, 32); + map->per_primitive_offset = map->size; + const uint64_t count_indices_bits = + VARYING_BIT_PRIMITIVE_COUNT | + VARYING_BIT_PRIMITIVE_INDICES; const uint64_t per_primitive_header_bits = - BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) | - BITFIELD64_BIT(VARYING_SLOT_LAYER) | - BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) | - BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE); + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; - const uint64_t per_vertex_header_bits = - BITFIELD64_BIT(VARYING_SLOT_PSIZ) | - BITFIELD64_BIT(VARYING_SLOT_POS) | - BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) | - BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + /* Do we need a header? */ + map->has_per_primitive_header = + (nir->info.outputs_written & + nir->info.per_primitive_outputs & + per_primitive_header_bits) != 0; - std::list orders[3]; - uint64_t regular_outputs = outputs_written & - ~(per_primitive_header_bits | per_vertex_header_bits); + uint32_t first_per_prim_offset; + brw_compute_per_primitive_map(map->per_primitive_offsets, + &map->per_primitive_stride, + &first_per_prim_offset, + map->has_per_primitive_header ? 32 : 0, + nir, nir_var_shader_out, + nir->info.outputs_written & + nir->info.per_primitive_outputs, + vue_layout != INTEL_VUE_LAYOUT_FIXED); - /* packing into prim header is possible only if prim header is present */ - map->user_data_in_primitive_header = compact_mue && - (outputs_written & per_primitive_header_bits) != 0; + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT] = 0; + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES] = 4; + if (map->has_per_primitive_header) { + /* Setup all the fields in the header */ + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; + map->per_primitive_offsets[VARYING_SLOT_LAYER] = 4; + map->per_primitive_offsets[VARYING_SLOT_VIEWPORT] = 8; + map->per_primitive_offsets[VARYING_SLOT_CULL_PRIMITIVE] = 12; + } - /* Packing into vert header is always possible, but we allow it only - * if full vec4 is available (so point size is not used) and there's - * nothing between it and normal vertex data (so no clip distances). 
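A short worked example of the byte-based layout computed above, with illustrative numbers (triangles, BRW_INDEX_FORMAT_U32, max_primitives = 16):

   /* Illustrative only: front section of the MUE. */
   uint32_t size = 4;                               /* NumPrimitives          */
   uint32_t indices_stride = 4 * 3;                 /* U32 x 3 vertices = 12B */
   size += indices_stride * 16;                     /* -> 196 bytes           */
   uint32_t per_primitive_offset = align(size, 32); /* -> 224 bytes           */
   /* If the shader writes shading rate, layer, viewport or cull, each
    * per-primitive record starts with a 32B header (dwords 0..3 as laid out
    * above) and user per-primitive attributes start at byte 32 of the record.
    */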
+ map->per_primitive_stride = align(map->per_primitive_stride, 32); + + map->size += map->per_primitive_stride * map->max_primitives; + assert(map->size % 32 == 0); + + assert((nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID) == 0 || + (nir->info.outputs_written & nir->info.per_primitive_outputs) != 0); + + /* Per vertex blocks: + * + * For some selected bit that can appear either as per-primitive or + * per-vertex inputs to the fragment shader, we need to add them to the + * per-vertex block as well so that the layouts match. Even though they're + * not written. */ - map->user_data_in_vertex_header = compact_mue && - (outputs_written & per_vertex_header_bits) == - BITFIELD64_BIT(VARYING_SLOT_POS); + const uint64_t per_primitive_outputs = + nir->info.outputs_written & nir->info.per_primitive_outputs; + const uint64_t per_vertex_outputs = + (nir->info.outputs_written & + ~(per_primitive_outputs | count_indices_bits | per_primitive_header_bits)); - if (outputs_written & per_primitive_header_bits) { - bool zero_layer_viewport = false; - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) { - map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = - map->per_primitive_start_dw + 0; - map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1; - /* Wa_16020916187: force 0 writes to layer and viewport slots */ - zero_layer_viewport = - intel_needs_workaround(compiler->devinfo, 16020916187); - } - - if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) || - zero_layer_viewport) { - map->start_dw[VARYING_SLOT_LAYER] = - map->per_primitive_start_dw + 1; /* RTAIndex */ - map->len_dw[VARYING_SLOT_LAYER] = 1; - } - - if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) || - zero_layer_viewport) { - map->start_dw[VARYING_SLOT_VIEWPORT] = - map->per_primitive_start_dw + 2; - map->len_dw[VARYING_SLOT_VIEWPORT] = 1; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) { - map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] = - map->per_primitive_start_dw + 3; - map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1; - } - - map->per_primitive_header_size_dw = 8; - outputs_written &= ~per_primitive_header_bits; - } else { - map->per_primitive_header_size_dw = 0; - } - - map->per_primitive_data_size_dw = 0; - - /* For fast linked libraries, we can't pack the MUE, as the fragment shader - * will be compiled without access to the MUE map and won't be able to find - * out where everything is. - * Instead, keep doing things as we did before the packing, just laying out - * everything in varying order, which is how the FS will expect them. 
- */ - if (compact_mue) { - brw_compute_mue_layout(compiler, orders, regular_outputs, nir, - &map->user_data_in_primitive_header, - &map->user_data_in_vertex_header); - - unsigned start_dw = map->per_primitive_start_dw; - if (map->user_data_in_primitive_header) - start_dw += 4; /* first 4 dwords are used */ - else - start_dw += map->per_primitive_header_size_dw; - unsigned header_used_dw = 0; - - for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) { - int location = (*it).location; - if (location < 0) { - start_dw += (*it).dwords; - if (map->user_data_in_primitive_header && header_used_dw < 4) - header_used_dw += (*it).dwords; - else - map->per_primitive_data_size_dw += (*it).dwords; - assert(header_used_dw <= 4); - continue; - } - - assert(map->start_dw[location] == -1); - - assert(location == VARYING_SLOT_PRIMITIVE_ID || - location >= VARYING_SLOT_VAR0); - - brw_mue_assign_position(&*it, map, start_dw); - - start_dw += (*it).dwords; - if (map->user_data_in_primitive_header && header_used_dw < 4) - header_used_dw += (*it).dwords; - else - map->per_primitive_data_size_dw += (*it).dwords; - assert(header_used_dw <= 4); - outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); - } - } else { - unsigned start_dw = map->per_primitive_start_dw + - map->per_primitive_header_size_dw; - - uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs; - while (per_prim_outputs) { - uint64_t location = ffsll(per_prim_outputs) - 1; - - assert(map->start_dw[location] == -1); - assert(location == VARYING_SLOT_PRIMITIVE_ID || - location >= VARYING_SLOT_VAR0); - - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - struct attr_desc d; - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - brw_mue_assign_position(&d, map, start_dw); - - map->per_primitive_data_size_dw += ALIGN(d.dwords, 4); - start_dw += ALIGN(d.dwords, 4); - - per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots); - } - } - - map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw + - map->per_primitive_data_size_dw, 8); - - map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw + - map->per_primitive_pitch_dw * - map->max_primitives, 8); - - /* TODO(mesh): Multiview. 
*/ - unsigned fixed_header_size = 8; - map->per_vertex_header_size_dw = ALIGN(fixed_header_size + - nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size, 8); - - if (outputs_written & per_vertex_header_bits) { - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) { - map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3; - map->len_dw[VARYING_SLOT_PSIZ] = 1; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) { - map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4; - map->len_dw[VARYING_SLOT_POS] = 4; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) { - map->start_dw[VARYING_SLOT_CLIP_DIST0] = - map->per_vertex_start_dw + fixed_header_size + 0; - map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4; - } - - if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) { - map->start_dw[VARYING_SLOT_CLIP_DIST1] = - map->per_vertex_start_dw + fixed_header_size + 4; - map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4; - } - - outputs_written &= ~per_vertex_header_bits; - } - - /* cull distances should be lowered earlier */ - assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0))); - assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1))); - - map->per_vertex_data_size_dw = 0; - - /* For fast linked libraries, we can't pack the MUE, as the fragment shader - * will be compiled without access to the MUE map and won't be able to find - * out where everything is. - * Instead, keep doing things as we did before the packing, just laying out - * everything in varying order, which is how the FS will expect them. - */ - if (compact_mue) { - unsigned start_dw = map->per_vertex_start_dw; - if (!map->user_data_in_vertex_header) - start_dw += map->per_vertex_header_size_dw; - - unsigned header_used_dw = 0; - for (unsigned type = VERT; type <= VERT_FLAT; ++type) { - for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) { - int location = (*it).location; - if (location < 0) { - start_dw += (*it).dwords; - if (map->user_data_in_vertex_header && header_used_dw < 4) { - header_used_dw += (*it).dwords; - assert(header_used_dw <= 4); - if (header_used_dw == 4) - start_dw += 4; /* jump over gl_position */ - } else { - map->per_vertex_data_size_dw += (*it).dwords; - } - continue; - } - - assert(map->start_dw[location] == -1); - - assert(location >= VARYING_SLOT_VAR0); - - brw_mue_assign_position(&*it, map, start_dw); - - start_dw += (*it).dwords; - if (map->user_data_in_vertex_header && header_used_dw < 4) { - header_used_dw += (*it).dwords; - assert(header_used_dw <= 4); - if (header_used_dw == 4) - start_dw += 4; /* jump over gl_position */ - } else { - map->per_vertex_data_size_dw += (*it).dwords; - } - outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); - } - } - } else { - unsigned start_dw = map->per_vertex_start_dw + - map->per_vertex_header_size_dw; - - uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs; - while (per_vertex_outputs) { - uint64_t location = ffsll(per_vertex_outputs) - 1; - - assert(map->start_dw[location] == -1); - assert(location >= VARYING_SLOT_VAR0); - - nir_variable *var = - brw_nir_find_complete_variable_with_location(nir, - nir_var_shader_out, - location); - struct attr_desc d; - d.location = location; - d.type = brw_nir_get_var_type(nir, var); - d.dwords = glsl_count_dword_slots(d.type, false); - d.slots = glsl_count_attribute_slots(d.type, false); - - brw_mue_assign_position(&d, map, start_dw); - - map->per_vertex_data_size_dw += 
ALIGN(d.dwords, 4); - start_dw += ALIGN(d.dwords, 4); - - per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots); - } - } - - map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw + - map->per_vertex_data_size_dw, 8); - - map->size_dw = - map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices; - - assert(map->size_dw % 8 == 0); + map->per_vertex_offset = map->size; + brw_compute_vue_map(compiler->devinfo, + &map->vue_map, per_vertex_outputs, + vue_layout, 1 /* pos_slots, TODO: multiview */); + map->per_vertex_stride = align(map->vue_map.num_slots * 16, 32); + map->size += map->per_vertex_stride * map->max_vertices; + assert(map->size % 32 == 0); } static void brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir) { - fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n", - map->size_dw, map->max_primitives, map->max_vertices); - fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n", - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT], - map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] + - map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1); - fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n", - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES], - map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] + - map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1); + fprintf(fp, "MUE map (%d bytes, %d primitives, %d vertices):\n", + map->size, map->max_primitives, map->max_vertices); + fprintf(fp, " indices_stride: %d\n", map->per_primitive_indices_stride); + fprintf(fp, " primitive_header: %d\n", map->has_per_primitive_header); + fprintf(fp, " primitive_offset: %d\n", map->per_primitive_offset); + fprintf(fp, " primitive_stride: %d\n", map->per_primitive_stride); + fprintf(fp, " vertex_offset: %d\n", map->per_vertex_offset); + fprintf(fp, " vertex_stride: %d\n", map->per_vertex_stride); - fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n", - map->per_primitive_start_dw, - map->per_primitive_header_size_dw, - map->per_primitive_data_size_dw, - map->per_primitive_pitch_dw); - - for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (map->start_dw[i] < 0) + fprintf(fp, " primitive offsets:\n"); + fprintf(fp, " %s: %d\n", + gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_COUNT, + MESA_SHADER_MESH), + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_COUNT]); + fprintf(fp, " %s: %d\n", + gl_varying_slot_name_for_stage(VARYING_SLOT_PRIMITIVE_INDICES, + MESA_SHADER_MESH), + map->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_INDICES]); + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (map->per_primitive_offsets[i] < 0 || + i == VARYING_SLOT_PRIMITIVE_COUNT || + i == VARYING_SLOT_PRIMITIVE_INDICES) continue; - - const unsigned offset = map->start_dw[i]; - const unsigned len = map->len_dw[i]; - - if (offset < map->per_primitive_start_dw || - offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw) - continue; - - const char *name = - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_MESH); - - fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1, - name, i); + fprintf(fp, " %s: %d (relative %d)\n", + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_MESH), + map->per_primitive_offset + map->per_primitive_offsets[i], + map->per_primitive_offsets[i]); } + brw_print_vue_map(fp, &map->vue_map, MESA_SHADER_MESH); +} - fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n", - map->per_vertex_start_dw, - 
map->per_vertex_header_size_dw, - map->per_vertex_data_size_dw, - map->per_vertex_pitch_dw);
+static bool
+remap_io_to_dwords(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_load_per_vertex_output &&
+ intrin->intrinsic != nir_intrinsic_load_per_primitive_output &&
+ intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
+ intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
+ return false;
- for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (map->start_dw[i] < 0) - continue;
+ nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
+ if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES || + io_sem.location == VARYING_SLOT_PRIMITIVE_COUNT) + return false;
- const unsigned offset = map->start_dw[i]; - const unsigned len = map->len_dw[i];
+ nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) * 4);
+ nir_intrinsic_set_range(intrin, nir_intrinsic_range(intrin) * 4);
- if (offset < map->per_vertex_start_dw || - offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw) - continue;
+ b->cursor = nir_before_instr(&intrin->instr);
- nir_variable *var = - nir_find_variable_with_location(nir, nir_var_shader_out, i); - bool flat = var->data.interpolation == INTERP_MODE_FLAT;
+ nir_src *offset = nir_get_io_offset_src(intrin);
+ assert(offset != NULL);
- const char *name = - gl_varying_slot_name_for_stage((gl_varying_slot)i, - MESA_SHADER_MESH);
+ nir_src_rewrite(offset, nir_ishl_imm(b, offset->ssa, 2));
- fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1, - name, i, flat ? " (flat)" : ""); - } - - fprintf(fp, "\n");
+ return true; } static void @@ -1229,12 +684,73 @@ brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map) nir_foreach_shader_out_variable(var, nir) { int location = var->data.location; assert(location >= 0);
- assert(map->start_dw[location] != -1);
- var->data.driver_location = map->start_dw[location];
+
+ switch (location) {
+ case VARYING_SLOT_PRIMITIVE_COUNT:
+ case VARYING_SLOT_PRIMITIVE_INDICES:
+ /* Primitive count & indices are not part of the per-primitive block,
+ * they have their own spot just before. We saved their offset in the
+ * per-primitive array; we just don't need to add the block
+ * offset.
+ */
+ var->data.driver_location = + map->per_primitive_offsets[location] / 4;
+ break;
+
+ case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 0;
+ break;
+
+ case VARYING_SLOT_LAYER:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 1;
+ break;
+
+ case VARYING_SLOT_VIEWPORT:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 2;
+ break;
+
+ case VARYING_SLOT_CULL_PRIMITIVE:
+ var->data.driver_location = map->per_primitive_offset / 16;
+ var->data.location_frac = 3;
+ break;
+
+ case VARYING_SLOT_PSIZ:
+ var->data.driver_location = map->per_vertex_offset / 16;
+ var->data.location_frac = 3;
+ break;
+
+ default:
+ if (nir->info.per_primitive_outputs & BITFIELD64_BIT(location)) {
+ assert(map->per_primitive_offsets[location] != -1);
+ var->data.driver_location = + (map->per_primitive_offset + + map->per_primitive_offsets[location]) / 16;
+ } else {
+ /* Each per vertex location has its own slot/vec4 (16B) of data; use
+ * map->vue_map.varying_to_slot[] to get the 16B offset and add the
+ * per-vertex block offset.
+ */ + assert(map->vue_map.varying_to_slot[location] != -1); + var->data.driver_location = + map->per_vertex_offset / 16 + + map->vue_map.varying_to_slot[location]; + } + break; + } } NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, - type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); + type_size_vec4, + nir_lower_io_lower_64bit_to_32); + + /* Everythings works with slots in terms if IO, but our backend deals with + * dwords. Apply remapping. + */ + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + remap_io_to_dwords, nir_metadata_control_flow, NULL); } static void @@ -1242,8 +758,6 @@ brw_nir_initialize_mue(nir_shader *nir, const struct brw_mue_map *map, unsigned dispatch_width) { - assert(map->per_primitive_header_size_dw > 0); - nir_builder b; nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); b = nir_builder_at(nir_before_impl(entrypoint)); @@ -1277,7 +791,7 @@ brw_nir_initialize_mue(nir_shader *nir, prim_in_inv * workgroup_size); nir_store_per_primitive_output(&b, zerovec, prim, dw_off, - .base = (int)map->per_primitive_start_dw, + .base = (int)map->per_primitive_offset / 4, .write_mask = WRITEMASK_XYZW, .component = 0, .src_type = nir_type_uint32); @@ -1297,7 +811,7 @@ brw_nir_initialize_mue(nir_shader *nir, prims_per_inv * workgroup_size); nir_store_per_primitive_output(&b, zerovec, prim, dw_off, - .base = (int)map->per_primitive_start_dw, + .base = (int)map->per_primitive_offset / 4, .write_mask = WRITEMASK_XYZW, .component = 0, .src_type = nir_type_uint32); @@ -1348,8 +862,7 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, switch (intrin->intrinsic) { case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_store_per_vertex_output: - brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw); - + brw_nir_adjust_offset(b, intrin, map->per_vertex_stride / 4); return true; case nir_intrinsic_load_per_primitive_output: @@ -1357,12 +870,11 @@ brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); uint32_t pitch; if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES) - pitch = map->per_primitive_indices_dw; + pitch = map->per_primitive_indices_stride; else - pitch = map->per_primitive_pitch_dw; - - brw_nir_adjust_offset(b, intrin, pitch); + pitch = map->per_primitive_stride; + brw_nir_adjust_offset(b, intrin, pitch / 4); return true; } @@ -1571,11 +1083,12 @@ brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader * those to 0. The workaround also requires disabling autostrip. */ if (intel_needs_workaround(compiler->devinfo, 16020916187) && - (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written)) + (VARYING_BIT_PRIMITIVE_SHADING_RATE & outputs_written)) return false; - if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 && - map->start_dw[VARYING_SLOT_LAYER] < 0) + /* Values not written */ + if ((outputs_written & (VARYING_BIT_VIEWPORT | + VARYING_BIT_LAYER)) == 0) return true; nir_def *vp = NULL; @@ -1694,7 +1207,7 @@ brw_compile_mesh(const struct brw_compiler *compiler, * When Primitive Header is enabled, we may not generates writes to all * fields, so let's initialize everything. 
*/ - if (prog_data->map.per_primitive_header_size_dw > 0) + if (prog_data->map.has_per_primitive_header) NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width); brw_nir_apply_key(shader, compiler, &key->base, dispatch_width); diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 1ce6eea63e0..4e3f185c805 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -194,11 +194,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) compiler->nir_options[i] = nir_options; } - compiler->mesh.mue_header_packing = - (unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3); - compiler->mesh.mue_compaction = - debug_get_bool_option("INTEL_MESH_COMPACTION", true); - return compiler; } @@ -218,8 +213,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler) bits++; insert_u64_bit(&config, compiler->lower_dpas); bits++; - insert_u64_bit(&config, compiler->mesh.mue_compaction); - bits++; enum intel_debug_flag debug_bits[] = { DEBUG_NO_DUAL_OBJECT_GS, @@ -244,9 +237,6 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler) mask = 3; bits += util_bitcount64(mask); - u_foreach_bit64(bit, mask) - insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0); - assert(bits <= util_bitcount64(UINT64_MAX)); return config; diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index b45f0eb0393..8727b3ec00f 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -122,11 +122,6 @@ struct brw_compiler { int spilling_rate; struct nir_shader *clc_shader; - - struct { - unsigned mue_header_packing; - bool mue_compaction; - } mesh; }; #define brw_shader_debug_log(compiler, data, fmt, ... ) do { \ @@ -371,7 +366,7 @@ struct brw_wm_prog_key { /* Whether or not we are running on a multisampled framebuffer */ enum intel_sometimes multisample_fbo:2; - /* Whether the preceding shader stage is mesh */ + /* Whether the shader is dispatch with a preceeding mesh shader */ enum intel_sometimes mesh_input:2; bool coherent_fb_fetch:1; @@ -388,7 +383,7 @@ brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key) return key->alpha_to_coverage == INTEL_SOMETIMES || key->persample_interp == INTEL_SOMETIMES || key->multisample_fbo == INTEL_SOMETIMES || - key->mesh_input == INTEL_SOMETIMES; + key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH; } struct brw_cs_prog_key { @@ -784,6 +779,13 @@ struct brw_wm_prog_data { */ uint64_t inputs; + /** + * The FS per-primitive inputs (some bits can be in both inputs & + * per_primitive_inputs if the shader is compiled without being linked to + * the previous stage) + */ + uint64_t per_primitive_inputs; + /** * Map from gl_varying_slot to the position within the FS setup data * payload where the varying's attribute vertex deltas should be delivered. @@ -1049,8 +1051,6 @@ typedef enum void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map, gl_shader_stage stage); -void brw_print_fs_urb_setup(FILE *fp, const struct brw_wm_prog_data *prog_data); - /** * Convert a VUE slot number into a byte offset within the VUE. 
 */
@@ -1069,6 +1069,16 @@ brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
    return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
 }

+void
+brw_compute_per_primitive_map(int *out_per_primitive_map,
+                              uint32_t *out_per_primitive_stride,
+                              uint32_t *out_first_offset,
+                              uint32_t base_offset,
+                              nir_shader *nir,
+                              uint32_t variables_mode,
+                              uint64_t slots_valid,
+                              bool separate_shader);
+
 void brw_compute_vue_map(const struct intel_device_info *devinfo,
                          struct intel_vue_map *vue_map,
                          uint64_t slots_valid,
@@ -1190,25 +1200,40 @@ struct brw_tue_map {
 };

 struct brw_mue_map {
-   int32_t start_dw[VARYING_SLOT_MAX];
-   uint32_t len_dw[VARYING_SLOT_MAX];
-   uint32_t per_primitive_indices_dw;
-
-   uint32_t size_dw;
+   /* Total size in bytes of the MUE (32B aligned) */
+   uint32_t size;

    uint32_t max_primitives;
-   uint32_t per_primitive_start_dw;
-   uint32_t per_primitive_header_size_dw;
-   uint32_t per_primitive_data_size_dw;
-   uint32_t per_primitive_pitch_dw;
-   bool user_data_in_primitive_header;
-
    uint32_t max_vertices;
-   uint32_t per_vertex_start_dw;
-   uint32_t per_vertex_header_size_dw;
-   uint32_t per_vertex_data_size_dw;
-   uint32_t per_vertex_pitch_dw;
-   bool user_data_in_vertex_header;
+
+   /* Stride in bytes between sets of primitive indices */
+   uint32_t per_primitive_indices_stride;
+
+   /* Per-primitive offset from the start of the MUE (32B aligned) */
+   uint32_t per_primitive_offset;
+
+   /* Per-primitive stride in bytes (32B aligned) */
+   uint32_t per_primitive_stride;
+
+   /* Whether the per-primitive block includes a header */
+   bool has_per_primitive_header;
+
+   /* Per-vertex offset in bytes from the start of the MUE (32B aligned) */
+   uint32_t per_vertex_offset;
+
+   /* Size of the per-vertex header (32B aligned) */
+   uint32_t per_vertex_header_size;
+
+   /* Per-vertex stride in bytes (32B aligned) */
+   uint32_t per_vertex_stride;
+
+   /* VUE map for the per-vertex attributes */
+   struct intel_vue_map vue_map;
+
+   /* Offset in bytes of each per-primitive varying relative to
+    * per_primitive_offset (-1 if unused)
+    */
+   int per_primitive_offsets[VARYING_SLOT_MAX];
 };

 struct brw_task_prog_data {
@@ -1628,7 +1653,28 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
  */
 int
 brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
-                                       const struct intel_vue_map *prev_stage_vue_map);
+                                       const struct intel_vue_map *prev_stage_vue_map,
+                                       bool mesh);
+
+void
+brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map,
+                                    bool mesh,
+                                    const struct brw_wm_prog_data *wm_prog_data,
+                                    uint32_t *out_first_slot,
+                                    uint32_t *num_slots,
+                                    uint32_t *out_num_varyings,
+                                    uint32_t *out_primitive_id_offset);
+
+/**
+ * Computes the URB offset at which SBE should read the per-primitive data
+ * written by the mesh shader.
+ */
+void
+brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read,
+                                       uint32_t num_varyings,
+                                       const struct brw_mue_map *mue_map,
+                                       uint32_t *out_read_offset,
+                                       uint32_t *out_read_length);

 /* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA.
*/ #define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8 diff --git a/src/intel/compiler/brw_from_nir.cpp b/src/intel/compiler/brw_from_nir.cpp index ebaf2670d92..a397f01cba0 100644 --- a/src/intel/compiler/brw_from_nir.cpp +++ b/src/intel/compiler/brw_from_nir.cpp @@ -4153,7 +4153,8 @@ brw_interp_reg(const brw_builder &bld, unsigned location, { brw_shader &s = *bld.shader; assert(s.stage == MESA_SHADER_FRAGMENT); - assert(BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs); + assert((BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs) || + location == VARYING_SLOT_PRIMITIVE_ID); const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data); @@ -4161,10 +4162,6 @@ brw_interp_reg(const brw_builder &bld, unsigned location, unsigned nr = prog_data->urb_setup[location]; channel += prog_data->urb_setup_channel[location]; - /* Adjust so we start counting from the first per_vertex input. */ - assert(nr >= prog_data->num_per_primitive_inputs); - nr -= prog_data->num_per_primitive_inputs; - const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; const unsigned regnr = per_vertex_start + (nr * 4) + channel; @@ -4196,14 +4193,15 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp) const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data); - comp += prog_data->urb_setup_channel[location]; + comp += (s.fs.per_primitive_offsets[location] % 16) / 4; - assert(prog_data->urb_setup[location] >= 0); - - const unsigned regnr = prog_data->urb_setup[location] + comp / 4; + const unsigned regnr = s.fs.per_primitive_offsets[location] / 16 + comp / 4; + assert(s.fs.per_primitive_offsets[location] >= 0); assert(regnr < prog_data->num_per_primitive_inputs); + brw_reg loc_reg = brw_attr_reg(regnr, BRW_TYPE_UD); + if (s.max_polygons > 1) { /* In multipolygon dispatch each primitive constant is a * dispatch_width-wide SIMD vector (see comment in @@ -4211,11 +4209,10 @@ brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp) * component() to select the specified parameter. 
*/ const brw_reg tmp = bld.vgrf(BRW_TYPE_UD); - bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD), - s.dispatch_width, comp % 4)); + bld.MOV(tmp, offset(loc_reg, s.dispatch_width, comp % 4)); return retype(tmp, BRW_TYPE_F); } else { - return component(brw_attr_reg(regnr, BRW_TYPE_F), comp % 4); + return component(loc_reg, comp % 4); } } @@ -4419,7 +4416,7 @@ brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb, break; } - if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) { + if (instr->intrinsic == nir_intrinsic_load_per_primitive_input) { assert(base != VARYING_SLOT_PRIMITIVE_INDICES); for (unsigned int i = 0; i < num_components; i++) { bld.MOV(offset(dest, bld, i), diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index d71a5b641f4..90d172617f9 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -658,11 +658,75 @@ lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin, return true; } +static bool +lower_indirect_primitive_id(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_per_primitive_input) + return false; + + if (nir_intrinsic_io_semantics(intrin).location != VARYING_SLOT_PRIMITIVE_ID) + return false; + + nir_def *indirect_primitive_id = data; + nir_def_replace(&intrin->def, indirect_primitive_id); + + return true; +} + void brw_nir_lower_fs_inputs(nir_shader *nir, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key) { + nir_def *indirect_primitive_id = NULL; + if (key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH && + (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID)) { + nir_builder _b = nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))), *b = &_b; + nir_def *index = nir_ushr_imm(b, + nir_load_fs_msaa_intel(b), + INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); + nir_def *max_poly = nir_load_max_polygon_intel(b); + /* Build the per-vertex offset into the attribute section of the thread + * payload. There is always one GRF of padding in front. + * + * The computation is fairly complicated due to the layout of the + * payload. You can find a description of the layout in + * brw_compile_fs.cpp brw_assign_urb_setup(). + * + * Gfx < 20 packs 2 slots per GRF (hence the %/ 2 in the formula) + * Gfx >= 20 pack 5 slots per GRF (hence the %/ 5 in the formula) + * + * Then an additional offset needs to added to handle how multiple + * polygon data is interleaved. + */ + nir_def *per_vertex_offset = nir_iadd_imm( + b, + devinfo->ver >= 20 ? + nir_iadd(b, + nir_imul(b, nir_udiv_imm(b, index, 5), nir_imul_imm(b, max_poly, 64)), + nir_imul_imm(b, nir_umod_imm(b, index, 5), 12)) : + nir_iadd_imm( + b, + nir_iadd( + b, + nir_imul(b, nir_udiv_imm(b, index, 2), nir_imul_imm(b, max_poly, 32)), + nir_imul_imm(b, nir_umod_imm(b, index, 2), 16)), + 12), + devinfo->grf_size); + /* When the attribute index is INTEL_MSAA_FLAG_PRIMITIVE_ID_MESH_INDEX, + * it means the value is coming from the per-primitive block. We always + * lay out PrimitiveID at offset 0 in the per-primitive block. + */ + nir_def *attribute_offset = nir_bcsel( + b, + nir_ieq_imm(b, index, INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH), + nir_imm_int(b, 0), per_vertex_offset); + indirect_primitive_id = + nir_read_attribute_payload_intel(b, attribute_offset); + } + nir_foreach_shader_in_variable(var, nir) { var->data.driver_location = var->data.location; @@ -679,6 +743,14 @@ brw_nir_lower_fs_inputs(nir_shader *nir, var->data.interpolation = flat ? 
INTERP_MODE_FLAT : INTERP_MODE_SMOOTH; } + + /* Always pull the PrimitiveID from the per-primitive block if mesh can be involved. + */ + if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && + key->mesh_input != INTEL_NEVER) { + var->data.per_primitive = true; + nir->info.per_primitive_inputs |= VARYING_BIT_PRIMITIVE_ID; + } } NIR_PASS(_, nir, nir_lower_io, @@ -704,6 +776,13 @@ brw_nir_lower_fs_inputs(nir_shader *nir, NULL); } + if (indirect_primitive_id != NULL) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + lower_indirect_primitive_id, + nir_metadata_control_flow, + indirect_primitive_id); + } + /* This pass needs actual constants */ NIR_PASS(_, nir, nir_opt_constant_folding); diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d4b9aec396e..3440d2874fa 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -477,6 +477,9 @@ brw_shader::init() this->gs.control_data_bits_per_vertex = 0; this->gs.control_data_header_size_bits = 0; + + memset(&this->fs.per_primitive_offsets, -1, + sizeof(this->fs.per_primitive_offsets)); } brw_shader::~brw_shader() @@ -548,6 +551,16 @@ brw_shader::import_uniforms(brw_shader *v) this->uniforms = v->uniforms; } +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. + * This brings in those uniform definitions + */ +void +brw_shader::import_per_primitive_offsets(const int *per_primitive_offsets) +{ + memcpy(this->fs.per_primitive_offsets, per_primitive_offsets, + sizeof(this->fs.per_primitive_offsets)); +} + enum intel_barycentric_mode brw_barycentric_mode(const struct brw_wm_prog_key *key, nir_intrinsic_instr *intr) @@ -1309,4 +1322,3 @@ brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE) { return brw_vgrf(brw_allocate_vgrf_number(s, units_of_REGSIZE), BRW_TYPE_UD); } - diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index f317aaf79ad..6ba50bdc500 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -83,6 +83,7 @@ public: ~brw_shader(); void import_uniforms(brw_shader *v); + void import_per_primitive_offsets(const int *per_primitive_offsets); void assign_curb_setup(); void convert_attr_sources_to_hw_regs(brw_inst *inst); @@ -202,6 +203,11 @@ public: unsigned control_data_header_size_bits; } gs; + struct { + /* Offset of per-primitive locations in bytes */ + int per_primitive_offsets[VARYING_SLOT_MAX]; + } fs; + unsigned grf_used; bool spilled_any_registers; bool needs_register_pressure; diff --git a/src/intel/compiler/brw_vue_map.c b/src/intel/compiler/brw_vue_map.c index 922c208df8e..d5c2a32d3ef 100644 --- a/src/intel/compiler/brw_vue_map.c +++ b/src/intel/compiler/brw_vue_map.c @@ -42,6 +42,105 @@ #include "brw_compiler.h" #include "dev/intel_debug.h" +#include "brw_nir.h" + +static unsigned +get_var_slots(gl_shader_stage stage, const nir_variable *var) +{ + const struct glsl_type *type = var->type; + + if (nir_is_arrayed_io(var, stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + /* EXT_mesh_shader: PRIMITIVE_INDICES is a flat array, not a proper arrayed + * output, as opposed to D3D-style mesh shaders where it's addressed by the + * primitive index. Prevent assigning several slots to primitive indices, + * to avoid some issues. 
+ */ + if (stage == MESA_SHADER_MESH && + var->data.location == VARYING_SLOT_PRIMITIVE_INDICES && + !nir_is_arrayed_io(var, stage)) + return 1; + + return glsl_count_vec4_slots(type, false, var->data.bindless); +} + +void +brw_compute_per_primitive_map(int *out_per_primitive_map, + uint32_t *out_per_primitive_stride, + uint32_t *out_first_offset, + uint32_t base_offset, + nir_shader *nir, + uint32_t variables_mode, + uint64_t slots_valid, + bool separate_shader) +{ + memset(out_per_primitive_map, -1, sizeof(*out_per_primitive_map) * VARYING_SLOT_MAX); + *out_per_primitive_stride = base_offset; + *out_first_offset = UINT32_MAX; + + const uint64_t count_indices_bits = + VARYING_BIT_PRIMITIVE_COUNT | + VARYING_BIT_PRIMITIVE_INDICES; + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_outputs_written = + slots_valid & ~(count_indices_bits | per_primitive_header_bits); + + *out_first_offset = base_offset; + + /* We put each variable in its own 16B slot. Technically we could do a lot + * better by allocating the space needed for the variable since the data is + * constant and not interpolated for the fragment shader. Unfortunately the + * backend treats those values similarly to vertex attributes and making + * that change would require a pretty large change in the backend. Let's do + * this later. + */ + + /* Lay out builtins first */ + const uint64_t builtins = + per_primitive_outputs_written & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(location, builtins) { + assert(out_per_primitive_map[location] == -1); + + out_per_primitive_map[location] = *out_per_primitive_stride; + *out_per_primitive_stride += 16; + } + + uint32_t generics_offset = *out_per_primitive_stride; + + /* Lay out generics */ + const uint64_t generics = + per_primitive_outputs_written & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); + const int first_generic_output = ffsl(generics) - 1; + u_foreach_bit64(location, generics) { + assert(out_per_primitive_map[location] == -1); + if (!separate_shader) { + /* Just append the location at the back */ + out_per_primitive_map[location] = *out_per_primitive_stride; + } else { + assert(location >= VARYING_SLOT_VAR0); + /* Each location has its fixed spot */ + out_per_primitive_map[location] = generics_offset + + 16 * (location - first_generic_output); + } + + *out_per_primitive_stride = + MAX2(out_per_primitive_map[location] + 16, + *out_per_primitive_stride); + + *out_first_offset = MIN2(out_per_primitive_map[location], + *out_first_offset); + } + + *out_first_offset = *out_first_offset == UINT32_MAX ? 0 : + ROUND_DOWN_TO(*out_first_offset, 32); +} static inline void assign_vue_slot(struct intel_vue_map *vue_map, int varying, int slot) @@ -63,6 +162,9 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, enum intel_vue_layout layout, uint32_t pos_slots) { + vue_map->slots_valid = slots_valid; + vue_map->layout = layout; + if (layout != INTEL_VUE_LAYOUT_FIXED) { /* In SSO mode, we don't know whether the adjacent stage will * read/write gl_ClipDistance, which has a fixed slot location. @@ -76,15 +178,17 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, slots_valid |= VARYING_BIT_CLIP_DIST1; } - vue_map->slots_valid = slots_valid; - vue_map->layout = layout; - /* gl_Layer, gl_ViewportIndex & gl_PrimitiveShadingRateEXT don't get their * own varying slots -- they are stored in the first VUE slot * (VARYING_SLOT_PSIZ). 
*/ slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE); + /* gl_FrontFace is provided somewhere else in the FS thread payload, it's + * never in the VUE. + */ + slots_valid &= ~VARYING_BIT_FACE; + /* Make sure that the values we store in vue_map->varying_to_slot and * vue_map->slot_to_varying won't overflow the signed chars that are used * to store them. Note that since vue_map->slot_to_varying sometimes holds @@ -153,35 +257,38 @@ brw_compute_vue_map(const struct intel_device_info *devinfo, * can assign them however we like. For normal programs, we simply assign * them contiguously. * - * For separate shader pipelines, we first assign built-in varyings - * contiguous slots. This works because ARB_separate_shader_objects - * requires that all shaders have matching built-in varying interface - * blocks. Next, we assign generic varyings based on their location - * (either explicit or linker assigned). This guarantees a fixed layout. - * * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX, * since it's encoded as the clip distances by emit_clip_distances(). * However, it may be output by transform feedback, and we'd rather not * recompute state when TF changes, so we just always include it. */ - uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); - while (builtins != 0) { - const int varying = ffsll(builtins) - 1; - if (vue_map->varying_to_slot[varying] == -1) { + if (layout != INTEL_VUE_LAYOUT_SEPARATE_MESH) { + const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, builtins) { + /* Already assigned above? */ + if (vue_map->varying_to_slot[varying] != -1) + continue; assign_vue_slot(vue_map, varying, slot++); } - builtins &= ~BITFIELD64_BIT(varying); } const int first_generic_slot = slot; - uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); - while (generics != 0) { - const int varying = ffsll(generics) - 1; + const uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, generics) { if (layout != INTEL_VUE_LAYOUT_FIXED) { slot = first_generic_slot + varying - VARYING_SLOT_VAR0; } assign_vue_slot(vue_map, varying, slot++); - generics &= ~BITFIELD64_BIT(varying); + } + + if (layout == INTEL_VUE_LAYOUT_SEPARATE_MESH) { + const uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); + u_foreach_bit64(varying, builtins) { + /* Already assigned above? */ + if (vue_map->varying_to_slot[varying] != -1) + continue; + assign_vue_slot(vue_map, varying, slot++); + } } vue_map->num_slots = slot; diff --git a/src/intel/compiler/intel_shader_enums.h b/src/intel/compiler/intel_shader_enums.h index 91255da816c..3326655433a 100644 --- a/src/intel/compiler/intel_shader_enums.h +++ b/src/intel/compiler/intel_shader_enums.h @@ -30,6 +30,9 @@ intel_sometimes_invert(enum intel_sometimes x) return (enum intel_sometimes)((int)INTEL_ALWAYS - (int)x); } +#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20) +#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH (32) + enum intel_msaa_flags { /** Must be set whenever any dynamic MSAA is used * @@ -63,6 +66,15 @@ enum intel_msaa_flags { * in the render target messages. */ INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18), + + /** Index of the PrimitiveID attribute relative to the first read + * attribute. + * + * This is not a flag but a value that cover bits 20:31. Value 32 means the + * PrimitiveID is coming from the PerPrimitive block, written by the Mesh + * shader. 
+   */
+   INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX = (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET),
 };

 MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags)

@@ -133,6 +145,10 @@ enum intel_vue_layout {
    * Mesh support.
    */
   INTEL_VUE_LAYOUT_SEPARATE,
+  /**
+   * Layout is separate and works with Mesh shaders.
+   */
+  INTEL_VUE_LAYOUT_SEPARATE_MESH,
 };

 /**
@@ -157,11 +173,70 @@ struct intel_vue_map {
   /**
    * The layout of the VUE
    *
-   * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
-   * without the linker having a chance to dead code eliminate unused varyings.
+   * Separable programs (GL_ARB_separate_shader_objects) can be mixed and
+   * matched without the linker having a chance to dead code eliminate unused
+   * varyings.
    *
    * This means that we have to use a fixed slot layout, based on the output's
    * location field, rather than assigning slots in a compact contiguous block.
+   *
+   * When using Mesh, another constraint arises: the HW limit on loading
+   * per-primitive & per-vertex data, 32 varyings in total. This requires us
+   * to be quite inventive with the way we lay things out. Take a fragment
+   * shader loading the following data:
+   *
+   *    float gl_ClipDistance[];
+   *    uint gl_PrimitiveID;
+   *    vec4 someAppValue[29];
+   *
+   * According to the Vulkan spec, someAppValue will occupy 29 slots,
+   * gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is
+   * coming from a VS/DS/GS shader, we can load all of this through a single
+   * block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum
+   * programmable value) and the layout with
+   * BRW_VUE_MAP_LAYOUT_FIXED/BRW_VUE_MAP_LAYOUT_SEPARATE will be this:
+   *
+   *   -----------------------
+   *   | gl_ClipDistance 0-3 |
+   *   |---------------------|
+   *   | gl_ClipDistance 4-7 |
+   *   |---------------------|
+   *   | gl_PrimitiveID      |
+   *   |---------------------|
+   *   | someAppValue[]      |
+   *   |---------------------|
+   *
+   * This works nicely as everything is coming from the same location in the
+   * URB.
+   *
+   * When mesh shaders are involved, gl_PrimitiveID is located in a different
+   * place in the URB (the per-primitive block) and requires programming
+   * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some
+   * additional data. The HW has a limit such that
+   * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength +
+   * 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the
+   * layout above, we would not be able to accommodate that HW limit.
+   *
+   * The solution to this is to lay the built-in varyings out
+   * (gl_ClipDistance omitted since it's part of the VUE header and cannot
+   * live anywhere else) at the end of the VUE like this:
+   *
+   *   -----------------------
+   *   | gl_ClipDistance 0-3 |
+   *   |---------------------|
+   *   | gl_ClipDistance 4-7 |
+   *   |---------------------|
+   *   | someAppValue[]      |
+   *   |---------------------|
+   *   | gl_PrimitiveID      |
+   *   |---------------------|
+   *
+   * This layout adds another challenge because with separate shader
+   * compilations, we cannot tell in the consumer shader how many outputs the
+   * producer has, so we don't know where the gl_PrimitiveID lives. The
+   * solution to this other problem is to read the built-in with a
+   * MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a
+   * push constant.
*/ enum intel_vue_layout layout; @@ -361,6 +436,7 @@ struct intel_fs_params { uint32_t rasterization_samples; bool coarse_pixel; bool alpha_to_coverage; + uint32_t primitive_id_index; }; static inline enum intel_msaa_flags @@ -392,6 +468,9 @@ intel_fs_msaa_flags(struct intel_fs_params params) if (params.alpha_to_coverage) fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE; + fs_msaa_flags |= (enum intel_msaa_flags)( + params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); + return fs_msaa_flags; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 44629946981..671924c5ee7 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -1757,7 +1757,10 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline, !device->vk.enabled_extensions.EXT_graphics_pipeline_library) { vue_layout = INTEL_VUE_LAYOUT_FIXED; } else { - vue_layout = INTEL_VUE_LAYOUT_SEPARATE; + vue_layout = + (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && + device->vk.enabled_extensions.EXT_mesh_shader) ? + INTEL_VUE_LAYOUT_SEPARATE_MESH : INTEL_VUE_LAYOUT_SEPARATE; } for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 2d707e4f81f..ec891a7dae0 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4976,6 +4976,9 @@ struct anv_graphics_pipeline { uint32_t view_mask; uint32_t instance_multiplier; + /* Attribute index of the PrimitiveID in the delivered attributes */ + uint32_t primitive_id_index; + bool kill_pixel; bool uses_xfb; bool sample_shading_enable; diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c index 4fd19bd3bee..01da0c25d67 100644 --- a/src/intel/vulkan/genX_gfx_state.c +++ b/src/intel/vulkan/genX_gfx_state.c @@ -780,6 +780,7 @@ update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state, .rasterization_samples = dyn->ms.rasterization_samples, .coarse_pixel = !vk_fragment_shading_rate_is_disabled(&dyn->fsr), .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable, + .primitive_id_index = pipeline->primitive_id_index, }); SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags); diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index b773704ae27..bca6441e048 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -531,7 +531,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline, const struct intel_mesh_urb_allocation alloc = intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config, task_prog_data ? task_prog_data->map.size_dw : 0, - mesh_prog_data->map.size_dw); + mesh_prog_data->map.size / 4); /* Zero out the primitive pipeline URB allocations. 
*/ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { @@ -641,14 +641,14 @@ sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline) get_mesh_prog_data(pipeline); const struct brw_mue_map *mue = &mesh_prog_data->map; return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) && - mue->start_dw[VARYING_SLOT_PRIMITIVE_ID] == -1; + mue->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_ID] == -1; } const struct intel_vue_map *fs_input_map = &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) && - fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1; + (fs_input_map->slots_valid & VARYING_BIT_PRIMITIVE_ID) == 0; } static void @@ -667,30 +667,34 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) return; } + const struct intel_vue_map *vue_map = + anv_pipeline_is_mesh(pipeline) ? + &get_mesh_prog_data(pipeline)->map.vue_map : + &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; + anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) { anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) { + int max_source_attr = 0; + uint32_t vertex_read_offset, vertex_read_length, vertex_varyings; + brw_compute_sbe_per_vertex_urb_read( + vue_map, anv_pipeline_is_mesh(pipeline), wm_prog_data, + &vertex_read_offset, &vertex_read_length, &vertex_varyings, + &pipeline->primitive_id_index); - /* TODO(mesh): Figure out cases where we need attribute swizzling. See also - * calculate_urb_setup() and related functions. - */ sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline); sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT; - sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; - sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; + sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs & + ((1u << vertex_varyings) - 1); + sbe.NumberofSFOutputAttributes = vertex_varyings; for (unsigned i = 0; i < 32; i++) sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; + /* As far as we can test, some of the fields in 3DSTATE_SBE & all of + * 3DSTATE_SBE_SWIZ has no effect when the pipeline is using Mesh so + * don't bother filling those fields. + */ if (anv_pipeline_is_primitive(pipeline)) { - const struct intel_vue_map *fs_input_map = - &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; - - int first_slot = - brw_compute_first_fs_urb_slot_required(wm_prog_data->inputs, - fs_input_map); - assert(first_slot % 2 == 0); - unsigned urb_entry_read_offset = first_slot / 2; - int max_source_attr = 0; for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; int input_index = wm_prog_data->urb_setup[attr]; @@ -702,7 +706,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) continue; } - const int slot = fs_input_map->varying_to_slot[attr]; + const int slot = vue_map->varying_to_slot[attr]; if (slot == -1) { /* This attribute does not exist in the VUE--that means that @@ -723,7 +727,7 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) /* We have to subtract two slots to account for the URB entry * output read offset in the VS and GS stages. 
*/ - const int source_attr = slot - 2 * urb_entry_read_offset; + const int source_attr = slot - 2 * vertex_read_offset; assert(source_attr >= 0 && source_attr < 32); max_source_attr = MAX2(max_source_attr, source_attr); /* The hardware can only do overrides on 16 overrides at a time, @@ -737,83 +741,51 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) assert(source_attr == input_index); } - sbe.VertexURBEntryReadOffset = urb_entry_read_offset; - sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2); + sbe.VertexURBEntryReadOffset = vertex_read_offset; + sbe.VertexURBEntryReadLength = vertex_read_length; sbe.ForceVertexURBEntryReadOffset = true; sbe.ForceVertexURBEntryReadLength = true; - - /* Ask the hardware to supply PrimitiveID if the fragment shader - * reads it but a previous stage didn't write one. - */ - if (sbe_primitive_id_override(pipeline)) { - sbe.PrimitiveIDOverrideAttributeSelect = - wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; - sbe.PrimitiveIDOverrideComponentX = true; - sbe.PrimitiveIDOverrideComponentY = true; - sbe.PrimitiveIDOverrideComponentZ = true; - sbe.PrimitiveIDOverrideComponentW = true; - } } - } - } + + /* Ask the hardware to supply PrimitiveID if the fragment shader reads + * it but a previous stage didn't write one. + */ + if (sbe_primitive_id_override(pipeline)) { + sbe.PrimitiveIDOverrideAttributeSelect = + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID]; + sbe.PrimitiveIDOverrideComponentX = true; + sbe.PrimitiveIDOverrideComponentY = true; + sbe.PrimitiveIDOverrideComponentZ = true; + sbe.PrimitiveIDOverrideComponentW = true; + } #if GFX_VERx10 >= 125 - if (device->vk.enabled_extensions.EXT_mesh_shader) { - anv_pipeline_emit(pipeline, final.sbe_mesh, - GENX(3DSTATE_SBE_MESH), sbe_mesh) { - if (!anv_pipeline_is_mesh(pipeline)) - continue; + if (device->vk.enabled_extensions.EXT_mesh_shader) { + anv_pipeline_emit(pipeline, final.sbe_mesh, + GENX(3DSTATE_SBE_MESH), sbe_mesh) { + if (!anv_pipeline_is_mesh(pipeline)) + continue; - const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - const struct brw_mue_map *mue = &mesh_prog_data->map; + const struct brw_mesh_prog_data *mesh_prog_data = + get_mesh_prog_data(pipeline); - assert(mue->per_vertex_header_size_dw % 8 == 0); - sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8; - sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8); + sbe_mesh.PerVertexURBEntryOutputReadOffset = vertex_read_offset; + sbe_mesh.PerVertexURBEntryOutputReadLength = vertex_read_length; - /* Clip distance array is passed in the per-vertex header so that it - * can be consumed by the HW. If user wants to read it in the FS, - * adjust the offset and length to cover it. Conveniently it is at - * the end of the per-vertex header, right before per-vertex - * attributes. - * - * Note that FS attribute reading must be aware that the clip - * distances have fixed position. 
- */ - if (mue->per_vertex_header_size_dw > 8 && - (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) { - sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; - sbe_mesh.PerVertexURBEntryOutputReadLength += 1; - } + uint32_t prim_read_offset, prim_read_length; + brw_compute_sbe_per_primitive_urb_read(wm_prog_data->per_primitive_inputs, + wm_prog_data->num_per_primitive_inputs, + &mesh_prog_data->map, + &prim_read_offset, + &prim_read_length); - if (mue->user_data_in_vertex_header) { - sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; - sbe_mesh.PerVertexURBEntryOutputReadLength += 1; - } - - assert(mue->per_primitive_header_size_dw % 8 == 0); - sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = - mue->per_primitive_header_size_dw / 8; - sbe_mesh.PerPrimitiveURBEntryOutputReadLength = - DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8); - - /* Just like with clip distances, if Primitive Shading Rate, Viewport - * Index or Layer is read back in the FS, adjust the offset and - * length to cover the Primitive Header, where PSR, Viewport Index & - * Layer are stored. - */ - if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 || - wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 || - mue->user_data_in_primitive_header) { - assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0); - sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1; - sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1; + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = prim_read_offset; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength = prim_read_length; } } - } #endif + } + } } static void @@ -842,7 +814,7 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline, } else { assert(anv_pipeline_is_mesh(pipeline)); const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0; + point_from_shader = mesh_prog_data->map.vue_map.slots_valid & VARYING_BIT_PSIZ; } if (point_from_shader) { @@ -893,7 +865,7 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); clip.ForceZeroRTAIndexEnable = - mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0; + mesh_prog_data->map.per_primitive_offsets[VARYING_SLOT_LAYER] < 0; } clip.NonPerspectiveBarycentricEnable = wm_prog_data ? 
@@ -905,12 +877,13 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, if (device->vk.enabled_extensions.EXT_mesh_shader) { anv_pipeline_emit(pipeline, final.clip_mesh, GENX(3DSTATE_CLIP_MESH), clip_mesh) { - if (anv_pipeline_is_mesh(pipeline)) { - const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); - clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0; - clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; - clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; - } + if (!anv_pipeline_is_mesh(pipeline)) + continue; + + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.has_per_primitive_header; + clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; + clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; } } #endif @@ -1903,9 +1876,9 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline) mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1; mesh.OutputTopology = output_topology; - mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8; - mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0; - mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8; + mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_stride / 32; + mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_stride > 0; + mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_stride / 32; mesh.IndexFormat = index_format; mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
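
A note on the PrimitiveID plumbing introduced above: the driver packs the attribute
index of PrimitiveID into bits 20:31 of the dynamic FS MSAA flags (see
intel_fs_msaa_flags()), and brw_nir_lower_fs_inputs() recovers it with a plain
unsigned right shift, since those are the top bits. The sketch below is illustrative
only and not part of the patch; the two defines mirror the ones added to
intel_shader_enums.h and the helper names are invented.

#include <stdint.h>

#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20)
#define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH   (32)

/* Driver side: mirror of the packing done in intel_fs_msaa_flags(). */
static inline uint32_t
pack_primitive_id_index(uint32_t fs_msaa_flags, uint32_t primitive_id_index)
{
   /* Index 32 (INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH) means the value is
    * read from the per-primitive block written by the mesh shader.
    */
   return fs_msaa_flags |
          (primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET);
}

/* Shader side: mirror of the nir_ushr_imm() in brw_nir_lower_fs_inputs();
 * no mask is needed because the index occupies the top bits.
 */
static inline uint32_t
unpack_primitive_id_index(uint32_t fs_msaa_flags)
{
   return fs_msaa_flags >> INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET;
}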
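
The payload offset that brw_nir_lower_fs_inputs() builds with the NIR builder for the
indirect PrimitiveID read is easier to follow in scalar form. Below is a sketch of
the same arithmetic under stated assumptions: the helper name is invented, index is
the value unpacked from the MSAA flags, max_polygons is the polygon count of the
dispatch, and grf_size is the GRF size in bytes taken from devinfo.

#include <stdint.h>

/* Illustrative scalar version of the offset fed to
 * nir_read_attribute_payload_intel() for the indirect PrimitiveID read.
 */
static uint32_t
primitive_id_attribute_offset(uint32_t ver, uint32_t grf_size,
                              uint32_t index, uint32_t max_polygons)
{
   /* Index 32 means PrimitiveID comes from the per-primitive block, which
    * always lays it out at offset 0.
    */
   if (index == 32 /* INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH */)
      return 0;

   uint32_t offset;
   if (ver >= 20) {
      /* Gfx20+ packs 5 slots per GRF. */
      offset = (index / 5) * (max_polygons * 64) + (index % 5) * 12;
   } else {
      /* Older gens pack 2 slots per GRF. */
      offset = (index / 2) * (max_polygons * 32) + (index % 2) * 16 + 12;
   }

   /* One GRF of padding sits in front of the attribute section. */
   return offset + grf_size;
}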