diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index b1deca378e8..c39c8f5809a 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -426,6 +426,9 @@ struct brw_task_prog_key
 struct brw_mesh_prog_key
 {
    struct brw_base_prog_key base;
+
+   bool compact_mue:1;
+   unsigned padding:31;
 };
 
 enum brw_sf_primitive {
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 759a5336487..0c7eacd753a 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -763,7 +763,7 @@ brw_compute_mue_layout(const struct brw_compiler *compiler,
 static void
 brw_compute_mue_map(const struct brw_compiler *compiler,
                     struct nir_shader *nir, struct brw_mue_map *map,
-                    enum brw_mesh_index_format index_format)
+                    enum brw_mesh_index_format index_format, bool compact_mue)
 {
    memset(map, 0, sizeof(*map));
 
@@ -823,21 +823,17 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
       ~(per_primitive_header_bits | per_vertex_header_bits);
 
    /* packing into prim header is possible only if prim header is present */
-   map->user_data_in_primitive_header =
+   map->user_data_in_primitive_header = compact_mue &&
          (outputs_written & per_primitive_header_bits) != 0;
 
    /* Packing into vert header is always possible, but we allow it only
     * if full vec4 is available (so point size is not used) and there's
     * nothing between it and normal vertex data (so no clip distances).
     */
-   map->user_data_in_vertex_header =
+   map->user_data_in_vertex_header = compact_mue &&
          (outputs_written & per_vertex_header_bits) ==
                BITFIELD64_BIT(VARYING_SLOT_POS);
 
-   brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
-                          &map->user_data_in_primitive_header,
-                          &map->user_data_in_vertex_header);
-
    if (outputs_written & per_primitive_header_bits) {
       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
          map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
@@ -871,39 +867,80 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
 
    map->per_primitive_data_size_dw = 0;
 
-   unsigned start_dw = map->per_primitive_start_dw;
-   if (map->user_data_in_primitive_header)
-      start_dw += 4; /* first 4 dwords are used */
-   else
-      start_dw += map->per_primitive_header_size_dw;
-   unsigned header_used_dw = 0;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
+                             &map->user_data_in_primitive_header,
+                             &map->user_data_in_vertex_header);
+
+      unsigned start_dw = map->per_primitive_start_dw;
+      if (map->user_data_in_primitive_header)
+         start_dw += 4; /* first 4 dwords are used */
+      else
+         start_dw += map->per_primitive_header_size_dw;
+      unsigned header_used_dw = 0;
+
+      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
+         int location = (*it).location;
+         if (location < 0) {
+            start_dw += (*it).dwords;
+            if (map->user_data_in_primitive_header && header_used_dw < 4)
+               header_used_dw += (*it).dwords;
+            else
+               map->per_primitive_data_size_dw += (*it).dwords;
+            assert(header_used_dw <= 4);
+            continue;
+         }
+
+         assert(map->start_dw[location] == -1);
+
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
+
+         brw_mue_assign_position(&*it, map, start_dw);
 
-   for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
-      int location = (*it).location;
-      if (location < 0) {
          start_dw += (*it).dwords;
          if (map->user_data_in_primitive_header && header_used_dw < 4)
            header_used_dw += (*it).dwords;
         else
            map->per_primitive_data_size_dw += (*it).dwords;
          assert(header_used_dw <= 4);
-         continue;
+         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
       }
+   } else {
+      unsigned start_dw = map->per_primitive_start_dw +
+                          map->per_primitive_header_size_dw;
 
-      assert(map->start_dw[location] == -1);
+      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
+      while (per_prim_outputs) {
+         uint64_t location = ffsl(per_prim_outputs) - 1;
 
-      assert(location == VARYING_SLOT_PRIMITIVE_ID ||
-             location >= VARYING_SLOT_VAR0);
+         assert(map->start_dw[location] == -1);
+         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
+                location >= VARYING_SLOT_VAR0);
 
-      brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
 
-      start_dw += (*it).dwords;
-      if (map->user_data_in_primitive_header && header_used_dw < 4)
-         header_used_dw += (*it).dwords;
-      else
-         map->per_primitive_data_size_dw += (*it).dwords;
-      assert(header_used_dw <= 4);
-      outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
+      }
    }
 
    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
@@ -951,15 +988,40 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
 
    map->per_vertex_data_size_dw = 0;
 
-   start_dw = map->per_vertex_start_dw;
-   if (!map->user_data_in_vertex_header)
-      start_dw += map->per_vertex_header_size_dw;
+   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
+    * will be compiled without access to the MUE map and won't be able to find
+    * out where everything is.
+    * Instead, keep doing things as we did before the packing, just laying out
+    * everything in varying order, which is how the FS will expect them.
+    */
+   if (compact_mue) {
+      unsigned start_dw = map->per_vertex_start_dw;
+      if (!map->user_data_in_vertex_header)
+         start_dw += map->per_vertex_header_size_dw;
+
+      unsigned header_used_dw = 0;
+      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
+         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
+            int location = (*it).location;
+            if (location < 0) {
+               start_dw += (*it).dwords;
+               if (map->user_data_in_vertex_header && header_used_dw < 4) {
+                  header_used_dw += (*it).dwords;
+                  assert(header_used_dw <= 4);
+                  if (header_used_dw == 4)
+                     start_dw += 4; /* jump over gl_position */
+               } else {
+                  map->per_vertex_data_size_dw += (*it).dwords;
+               }
+               continue;
+            }
+
+            assert(map->start_dw[location] == -1);
+
+            assert(location >= VARYING_SLOT_VAR0);
+
+            brw_mue_assign_position(&*it, map, start_dw);
 
-   header_used_dw = 0;
-   for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
-      for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
-         int location = (*it).location;
-         if (location < 0) {
             start_dw += (*it).dwords;
             if (map->user_data_in_vertex_header && header_used_dw < 4) {
                header_used_dw += (*it).dwords;
@@ -969,25 +1031,36 @@ brw_compute_mue_map(const struct brw_compiler *compiler,
             } else {
                map->per_vertex_data_size_dw += (*it).dwords;
             }
-            continue;
+            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
          }
+      }
+   } else {
+      unsigned start_dw = map->per_vertex_start_dw +
+                          map->per_vertex_header_size_dw;
+
+      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
+      while (per_vertex_outputs) {
+         uint64_t location = ffsl(per_vertex_outputs) - 1;
 
          assert(map->start_dw[location] == -1);
-
          assert(location >= VARYING_SLOT_VAR0);
 
-         brw_mue_assign_position(&*it, map, start_dw);
+         nir_variable *var =
+            brw_nir_find_complete_variable_with_location(nir,
+                                                         nir_var_shader_out,
+                                                         location);
+         struct attr_desc d;
+         d.location = location;
+         d.type = brw_nir_get_var_type(nir, var);
+         d.dwords = glsl_count_dword_slots(d.type, false);
+         d.slots = glsl_count_attribute_slots(d.type, false);
 
-         start_dw += (*it).dwords;
-         if (map->user_data_in_vertex_header && header_used_dw < 4) {
-            header_used_dw += (*it).dwords;
-            assert(header_used_dw <= 4);
-            if (header_used_dw == 4)
-               start_dw += 4; /* jump over gl_position */
-         } else {
-            map->per_vertex_data_size_dw += (*it).dwords;
-         }
-         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
+         brw_mue_assign_position(&d, map, start_dw);
+
+         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
+         start_dw += ALIGN(d.dwords, 4);
+
+         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
       }
    }
@@ -1435,7 +1508,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
 
    brw_nir_lower_tue_inputs(nir, params->tue_map);
 
-   brw_compute_mue_map(compiler, nir, &prog_data->map, prog_data->index_format);
+   brw_compute_mue_map(compiler, nir, &prog_data->map,
+                       prog_data->index_format, key->compact_mue);
    brw_nir_lower_mue_outputs(nir, &prog_data->map);
 
    brw_simd_selection_state simd_state{
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 5bbcd6440fe..5e9ea5ed1c7 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -554,11 +554,14 @@ populate_task_prog_key(struct anv_pipeline_stage *stage,
 
 static void
 populate_mesh_prog_key(struct anv_pipeline_stage *stage,
-                       const struct anv_device *device)
+                       const struct anv_device *device,
+                       bool compact_mue)
 {
    memset(&stage->key, 0, sizeof(stage->key));
 
    populate_base_prog_key(stage, device);
+
+   stage->key.mesh.compact_mue = compact_mue;
 }
 
 static uint32_t
@@ -1737,9 +1740,13 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
          populate_task_prog_key(&stages[s], device);
          break;
 
-      case MESA_SHADER_MESH:
-         populate_mesh_prog_key(&stages[s], device);
+      case MESA_SHADER_MESH: {
+         const bool compact_mue =
+            !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+              !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+         populate_mesh_prog_key(&stages[s], device, compact_mue);
          break;
+      }
 
       default:
          unreachable("Invalid graphics shader stage");
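
The non-compact ("else") paths above boil down to one rule: walk the written-outputs bitmask from the lowest VARYING_SLOT upward and give every output a whole number of vec4 slots (ALIGN(dwords, 4)), with no packing and no reordering. Because the resulting offsets depend only on the output bitmask and the attribute types, a fragment shader compiled later, without the MUE map, can recompute the same offsets. A minimal standalone sketch of that rule follows (not Mesa code; the names are illustrative and each output is assumed to occupy a single varying slot):

#include <stdint.h>
#include <stdio.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

int main(void)
{
   /* Dwords per varying slot for a made-up mesh shader:
    * slot 0 holds a vec3, slot 2 a float, slot 5 a vec4. */
   unsigned dwords[64] = {0};
   dwords[0] = 3;
   dwords[2] = 1;
   dwords[5] = 4;
   uint64_t written = (1ull << 0) | (1ull << 2) | (1ull << 5);

   unsigned start_dw = 0; /* stands in for start + header size */

   while (written) {
      unsigned location = __builtin_ctzll(written); /* lowest written slot */
      printf("slot %u -> dword %u (%u dwords, padded to %u)\n",
             location, start_dw, dwords[location], ALIGN4(dwords[location]));
      start_dw += ALIGN4(dwords[location]); /* vec4 granularity, no packing */
      written &= written - 1;               /* clear lowest set bit */
   }
   return 0;
}

For the example bitmask this places slot 0 at dword 0, slot 2 at dword 4, and slot 5 at dword 8; a compacted layout would instead let slot 2's single float share the padding of slot 0.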
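On the anv side, the new key bit is just the condition from the last hunk: packing is disabled only for a graphics pipeline library that contains the mesh stage but not the fragment stage, since that is the one case where the FS is compiled elsewhere. A simplified restatement of that boolean (sketch only, not the driver code; the function name is made up):

/* Compact (packed) MUE layouts are safe whenever the fragment shader is
 * compiled together with the mesh shader and can consult the MUE map. */
static bool
want_compact_mue(bool is_graphics_lib, bool has_fragment_stage)
{
   return !(is_graphics_lib && !has_fragment_stage);
}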