intel: prepare VUE layout for more than 2 layouts

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34109>
Lionel Landwerlin 2025-04-29 17:40:22 +03:00 committed by Marge Bot
parent 95efdca00b
commit 2d396f6085
19 changed files with 196 additions and 93 deletions
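Across the hunks below the recurring change is the same: the boolean separate_shader argument of brw_compute_vue_map()/elk_compute_vue_map() is replaced by the new enum intel_vue_layout. A minimal before/after sketch of one call site, mirroring the crocus/elk hunks (identifiers as they appear in the diff):

    /* before: a single boolean distinguishing SSO from linked pipelines */
    elk_compute_vue_map(devinfo, &vue_prog_data->vue_map, outputs_written,
                        nir->info.separate_shader, /* pos slots */ 1);

    /* after: an explicit layout enum, leaving room for more than two layouts */
    elk_compute_vue_map(devinfo, &vue_prog_data->vue_map, outputs_written,
                        nir->info.separate_shader ? INTEL_VUE_LAYOUT_SEPARATE
                                                  : INTEL_VUE_LAYOUT_FIXED,
                        /* pos slots */ 1);

The brw (Gfx9+) and anv paths instead carry the enum in the program keys (base.vue_layout / vue.layout) and pass that through.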

View file

@ -1204,7 +1204,9 @@ crocus_compile_vs(struct crocus_context *ice,
crocus_vs_outputs_written(ice, key, nir->info.outputs_written);
elk_compute_vue_map(devinfo,
&vue_prog_data->vue_map, outputs_written,
nir->info.separate_shader, /* pos slots */ 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, /* pos slots */ 1);
/* Don't tell the backend about our clip plane constants, we've already
* lowered them in NIR and we don't want it doing it again.
@ -1694,7 +1696,9 @@ crocus_compile_gs(struct crocus_context *ice,
elk_compute_vue_map(devinfo,
&vue_prog_data->vue_map, nir->info.outputs_written,
nir->info.separate_shader, /* pos slots */ 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, /* pos slots */ 1);
if (devinfo->ver == 6)
gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data);
@ -1969,7 +1973,7 @@ update_last_vue_map(struct crocus_context *ice,
ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP];
}
if (changed_slots || (old_map && old_map->separate != vue_map->separate)) {
if (changed_slots || (old_map && old_map->layout != vue_map->layout)) {
ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
if (devinfo->ver < 6)
ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
@ -2872,7 +2876,7 @@ crocus_create_fs_state(struct pipe_context *ctx,
if (devinfo->ver < 6) {
elk_compute_vue_map(devinfo, &vue_map,
info->inputs_read | VARYING_BIT_POS,
false, /* pos slots */ 1);
INTEL_VUE_LAYOUT_FIXED, /* pos slots */ 1);
}
if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
crocus_compile_fs(ice, ish, &key, &vue_map);

View file

@ -231,7 +231,8 @@ struct iris_vue_prog_key {
struct iris_base_prog_key base;
unsigned nr_userclip_plane_consts:4;
unsigned padding:28;
enum intel_vue_layout layout:2;
unsigned padding:26;
};
struct iris_vs_prog_key {
@ -284,7 +285,8 @@ struct iris_fs_prog_key {
bool multisample_fbo:1;
bool force_dual_color_blend:1;
bool coherent_fb_fetch:1;
uint64_t padding:43;
enum intel_vue_layout vue_layout:2;
uint64_t padding:41;
};
struct iris_cs_prog_key {
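The 2-bit layout field is carved out of the existing padding, so both iris key structs keep their width; the bit accounting for the two hunks above:

    iris_vue_prog_key: 4 (nr_userclip_plane_consts) + 2 (layout) + 26 (padding) = 32 bits,
                       the same as the previous 4 + 28
    iris_fs_prog_key:  the uint64_t padding shrinks from 43 to 41 bits to absorb the
                       2-bit vue_layout field, again leaving the total width unchanged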

View file

@ -55,12 +55,20 @@
#include "iris_pipe.h"
#include "nir/tgsi_to_nir.h"
#define KEY_INIT(prefix) \
.prefix.program_string_id = ish->program_id, \
.prefix.limit_trig_input_range = screen->driconf.limit_trig_input_range
#define BRW_KEY_INIT(gen, prog_id, limit_trig_input) \
static inline enum intel_vue_layout
vue_layout(bool separate_shader)
{
return separate_shader ? INTEL_VUE_LAYOUT_SEPARATE : INTEL_VUE_LAYOUT_FIXED;
}
#define KEY_INIT(prefix) \
.prefix.program_string_id = ish->program_id, \
.prefix.limit_trig_input_range = \
screen->driconf.limit_trig_input_range
#define BRW_KEY_INIT(gen, prog_id, limit_trig_input, _vue_layout) \
.base.program_string_id = prog_id, \
.base.limit_trig_input_range = limit_trig_input
.base.limit_trig_input_range = limit_trig_input, \
.base.vue_layout = _vue_layout
#ifdef INTEL_USE_ELK
#define ELK_KEY_INIT(gen, prog_id, limit_trig_input) \
@ -525,7 +533,8 @@ iris_to_brw_vs_key(const struct iris_screen *screen,
{
return (struct brw_vs_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->vue.base.program_string_id,
key->vue.base.limit_trig_input_range),
key->vue.base.limit_trig_input_range,
key->vue.layout),
};
}
@ -535,7 +544,8 @@ iris_to_brw_tcs_key(const struct iris_screen *screen,
{
return (struct brw_tcs_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->vue.base.program_string_id,
key->vue.base.limit_trig_input_range),
key->vue.base.limit_trig_input_range,
key->vue.layout),
._tes_primitive_mode = key->_tes_primitive_mode,
.input_vertices = key->input_vertices,
.patch_outputs_written = key->patch_outputs_written,
@ -549,7 +559,8 @@ iris_to_brw_tes_key(const struct iris_screen *screen,
{
return (struct brw_tes_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->vue.base.program_string_id,
key->vue.base.limit_trig_input_range),
key->vue.base.limit_trig_input_range,
key->vue.layout),
.patch_inputs_read = key->patch_inputs_read,
.inputs_read = key->inputs_read,
};
@ -561,7 +572,8 @@ iris_to_brw_gs_key(const struct iris_screen *screen,
{
return (struct brw_gs_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->vue.base.program_string_id,
key->vue.base.limit_trig_input_range),
key->vue.base.limit_trig_input_range,
key->vue.layout),
};
}
@ -571,7 +583,8 @@ iris_to_brw_fs_key(const struct iris_screen *screen,
{
return (struct brw_wm_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->base.program_string_id,
key->base.limit_trig_input_range),
key->base.limit_trig_input_range,
key->vue_layout),
.nr_color_regions = key->nr_color_regions,
.flat_shade = key->flat_shade,
.alpha_test_replicate_alpha = key->alpha_test_replicate_alpha,
@ -595,7 +608,8 @@ iris_to_brw_cs_key(const struct iris_screen *screen,
{
return (struct brw_cs_prog_key) {
BRW_KEY_INIT(screen->devinfo->ver, key->base.program_string_id,
key->base.limit_trig_input_range),
key->base.limit_trig_input_range,
INTEL_VUE_LAYOUT_SEPARATE),
};
}
@ -1884,7 +1898,7 @@ iris_compile_vs(struct iris_screen *screen,
brw_compute_vue_map(devinfo,
&brw_prog_data->base.vue_map, nir->info.outputs_written,
nir->info.separate_shader, /* pos_slots */ 1);
key->vue.layout, /* pos_slots */ 1);
struct brw_vs_prog_key brw_key = iris_to_brw_vs_key(screen, key);
@ -1916,7 +1930,9 @@ iris_compile_vs(struct iris_screen *screen,
elk_compute_vue_map(devinfo,
&elk_prog_data->base.vue_map, nir->info.outputs_written,
nir->info.separate_shader, /* pos_slots */ 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, /* pos_slots */ 1);
struct elk_vs_prog_key elk_key = iris_to_elk_vs_key(screen, key);
@ -1983,7 +1999,10 @@ iris_update_compiled_vs(struct iris_context *ice)
struct iris_uncompiled_shader *ish =
ice->shaders.uncompiled[MESA_SHADER_VERTEX];
struct iris_vs_prog_key key = { KEY_INIT(vue.base) };
struct iris_vs_prog_key key = {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
};
screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
struct iris_compiled_shader *old = ice->shaders.prog[IRIS_CACHE_VS];
@ -2208,6 +2227,7 @@ iris_update_compiled_tcs(struct iris_context *ice)
iris_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
struct iris_tcs_prog_key key = {
.vue.base.program_string_id = tcs ? tcs->program_id : 0,
.vue.layout = vue_layout(tcs ? tcs->nir->info.separate_shader : false),
._tes_primitive_mode = tes_info->tess._primitive_mode,
.input_vertices =
!tcs || iris_use_tcs_multi_patch(screen) ? ice->state.vertices_per_patch : 0,
@ -2416,7 +2436,10 @@ iris_update_compiled_tes(struct iris_context *ice)
struct iris_uncompiled_shader *ish =
ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
struct iris_tes_prog_key key = { KEY_INIT(vue.base) };
struct iris_tes_prog_key key = {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
};
get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read);
screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
@ -2500,7 +2523,7 @@ iris_compile_gs(struct iris_screen *screen,
brw_compute_vue_map(devinfo,
&brw_prog_data->base.vue_map, nir->info.outputs_written,
nir->info.separate_shader, /* pos_slots */ 1);
key->vue.layout, /* pos_slots */ 1);
struct brw_gs_prog_key brw_key = iris_to_brw_gs_key(screen, key);
@ -2530,7 +2553,9 @@ iris_compile_gs(struct iris_screen *screen,
elk_compute_vue_map(devinfo,
&elk_prog_data->base.vue_map, nir->info.outputs_written,
nir->info.separate_shader, /* pos_slots */ 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, /* pos_slots */ 1);
struct elk_gs_prog_key elk_key = iris_to_elk_gs_key(screen, key);
@ -2600,7 +2625,10 @@ iris_update_compiled_gs(struct iris_context *ice)
struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
if (ish) {
struct iris_gs_prog_key key = { KEY_INIT(vue.base) };
struct iris_gs_prog_key key = {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
};
screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
bool added;
@ -2777,7 +2805,10 @@ iris_update_compiled_fs(struct iris_context *ice)
struct iris_uncompiled_shader *ish =
ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
struct iris_fs_prog_key key = { KEY_INIT(base) };
struct iris_fs_prog_key key = {
KEY_INIT(base),
.vue_layout = vue_layout(ish->nir->info.separate_shader),
};
screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key);
struct intel_vue_map *last_vue_map =
@ -2847,7 +2878,7 @@ update_last_vue_map(struct iris_context *ice,
ice->state.dirty |= IRIS_DIRTY_CLIP;
}
if (changed_slots || (old_map && old_map->separate != vue_map->separate)) {
if (changed_slots || (old_map && old_map->layout != vue_map->layout)) {
ice->state.dirty |= IRIS_DIRTY_SBE;
}
@ -3429,13 +3460,17 @@ iris_create_shader_state(struct pipe_context *ctx,
if (info->clip_distance_array_size == 0)
ish->nos |= (1ull << IRIS_NOS_RASTERIZER);
key.vs = (struct iris_vs_prog_key) { KEY_INIT(vue.base) };
key.vs = (struct iris_vs_prog_key) {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
};
key_size = sizeof(key.vs);
break;
case MESA_SHADER_TESS_CTRL: {
key.tcs = (struct iris_tcs_prog_key) {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
// XXX: make sure the linker fills this out from the TES...
._tes_primitive_mode =
info->tess._primitive_mode ? info->tess._primitive_mode
@ -3463,6 +3498,7 @@ iris_create_shader_state(struct pipe_context *ctx,
key.tes = (struct iris_tes_prog_key) {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
// XXX: not ideal, need TCS output/TES input unification
.inputs_read = info->inputs_read,
.patch_inputs_read = info->patch_inputs_read,
@ -3474,7 +3510,10 @@ iris_create_shader_state(struct pipe_context *ctx,
case MESA_SHADER_GEOMETRY:
ish->nos |= (1ull << IRIS_NOS_RASTERIZER);
key.gs = (struct iris_gs_prog_key) { KEY_INIT(vue.base) };
key.gs = (struct iris_gs_prog_key) {
KEY_INIT(vue.base),
.vue.layout = vue_layout(ish->nir->info.separate_shader),
};
key_size = sizeof(key.gs);
break;
@ -3505,6 +3544,7 @@ iris_create_shader_state(struct pipe_context *ctx,
key.fs = (struct iris_fs_prog_key) {
KEY_INIT(base),
.vue_layout = vue_layout(ish->nir->info.separate_shader),
.nr_color_regions = util_bitcount(color_outputs),
.coherent_fb_fetch = devinfo->ver >= 9 && devinfo->ver < 20,
.input_slots_valid =

View file

@ -91,7 +91,9 @@ blorp_compile_vs_elk(struct blorp_context *blorp, void *mem_ctx,
elk_compute_vue_map(compiler->devinfo,
&vs_prog_data->base.vue_map,
nir->info.outputs_written,
nir->info.separate_shader,
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED,
1);
struct elk_vs_prog_key vs_key = { 0, };
@ -231,7 +233,8 @@ blorp_ensure_sf_program_elk(struct blorp_batch *batch,
unsigned program_size;
struct intel_vue_map vue_map;
elk_compute_vue_map(compiler->devinfo, &vue_map, slots_valid, false, 1);
elk_compute_vue_map(compiler->devinfo, &vue_map, slots_valid,
INTEL_VUE_LAYOUT_FIXED, 1);
struct elk_sf_prog_data prog_data_tmp;
program = elk_compile_sf(compiler, mem_ctx, &key.key,

View file

@ -825,7 +825,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
struct intel_vue_map prev_stage_vue_map;
brw_compute_vue_map(devinfo, &prev_stage_vue_map,
key->input_slots_valid,
nir->info.separate_shader, 1);
key->base.vue_layout, 1);
int first_slot =
brw_compute_first_fs_urb_slot_required(unique_fs_attrs,

View file

@ -161,7 +161,7 @@ brw_compile_gs(const struct brw_compiler *compiler,
GLbitfield64 inputs_read = nir->info.inputs_read;
brw_compute_vue_map(compiler->devinfo,
&input_vue_map, inputs_read,
nir->info.separate_shader, 1);
key->base.vue_layout, 1);
brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
brw_nir_lower_vue_inputs(nir, &input_vue_map);

View file

@ -1662,12 +1662,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,
brw_nir_lower_tue_inputs(nir, params->tue_map);
/* Incorrectly set separate to false until we fix the anv/brw in the next
* commit.
*/
brw_compute_mue_map(compiler, nir, &prog_data->map,
prog_data->index_format,
false /* TODO: use nir->info.separate_shader */);
key->base.vue_layout);
brw_nir_lower_mue_outputs(nir, &prog_data->map);
prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);

View file

@ -200,7 +200,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
struct intel_vue_map input_vue_map;
brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
nir->info.separate_shader, 1);
key->base.vue_layout, 1);
brw_compute_tess_vue_map(&vue_prog_data->vue_map,
nir->info.outputs_written,
nir->info.patch_outputs_written);

View file

@ -84,7 +84,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
nir->info.outputs_written,
nir->info.separate_shader, 1);
key->base.vue_layout, 1);
unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;

View file

@ -210,14 +210,16 @@ struct brw_base_prog_key {
bool uses_inline_push_addr:1;
unsigned padding:21;
enum intel_vue_layout vue_layout:2;
/**
* Apply workarounds for SIN and COS input range problems.
* This limits input range for SIN and COS to [-2p : 2p] to
* avoid precision issues.
*/
bool limit_trig_input_range;
bool limit_trig_input_range:1;
unsigned padding:26;
};
/**
@ -1070,7 +1072,7 @@ brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
void brw_compute_vue_map(const struct intel_device_info *devinfo,
struct intel_vue_map *vue_map,
uint64_t slots_valid,
bool separate_shader,
enum intel_vue_layout layout,
uint32_t pos_slots);
void brw_compute_tess_vue_map(struct intel_vue_map *const vue_map,

View file

@ -60,10 +60,10 @@ void
brw_compute_vue_map(const struct intel_device_info *devinfo,
struct intel_vue_map *vue_map,
uint64_t slots_valid,
bool separate,
enum intel_vue_layout layout,
uint32_t pos_slots)
{
if (separate) {
if (layout != INTEL_VUE_LAYOUT_FIXED) {
/* In SSO mode, we don't know whether the adjacent stage will
* read/write gl_ClipDistance, which has a fixed slot location.
* We have to assume the worst and reserve a slot for it, or else
@ -77,7 +77,7 @@ brw_compute_vue_map(const struct intel_device_info *devinfo,
}
vue_map->slots_valid = slots_valid;
vue_map->separate = separate;
vue_map->layout = layout;
/* gl_Layer, gl_ViewportIndex & gl_PrimitiveShadingRateEXT don't get their
* own varying slots -- they are stored in the first VUE slot
@ -177,7 +177,7 @@ brw_compute_vue_map(const struct intel_device_info *devinfo,
uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
while (generics != 0) {
const int varying = ffsll(generics) - 1;
if (separate) {
if (layout != INTEL_VUE_LAYOUT_FIXED) {
slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
}
assign_vue_slot(vue_map, varying, slot++);
@ -202,8 +202,10 @@ brw_compute_tess_vue_map(struct intel_vue_map *vue_map,
/* I don't think anything actually uses this... */
vue_map->slots_valid = vertex_slots;
/* separate isn't really meaningful, but make sure it's initialized */
vue_map->separate = false;
/* separate isn't really meaningful; we always compile tessellation
* shaders together, so use a fixed layout.
*/
vue_map->layout = INTEL_VUE_LAYOUT_FIXED;
vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
VARYING_BIT_TESS_LEVEL_INNER);
@ -278,12 +280,17 @@ void
brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
gl_shader_stage stage)
{
const char *layout_name =
vue_map->layout == INTEL_VUE_LAYOUT_FIXED ? "fixed" :
vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE ? "separate" :
"separate-mesh";
if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) {
fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
vue_map->num_slots,
vue_map->num_per_patch_slots,
vue_map->num_per_vertex_slots,
vue_map->separate ? "SSO" : "non-SSO");
layout_name);
for (int i = 0; i < vue_map->num_slots; i++) {
if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) {
fprintf(fp, " [%02d] VARYING_SLOT_PATCH%d\n", i,
@ -295,8 +302,7 @@ brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
}
} else {
fprintf(fp, "%s VUE map (%d slots, %s)\n",
gl_shader_stage_name(stage),
vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
gl_shader_stage_name(stage), vue_map->num_slots, layout_name);
for (int i = 0; i < vue_map->num_slots; i++) {
fprintf(fp, " [%02d] %s\n", i,
varying_name(vue_map->slot_to_varying[i], stage));
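The generic-varying loop above is where the two layouts actually diverge: with INTEL_VUE_LAYOUT_FIXED slots are handed out contiguously, while the non-fixed layouts use the location-based formula so a producer and consumer compiled separately still agree. A small worked example (hypothetical shader writing only VAR0 and VAR3):

    INTEL_VUE_LAYOUT_FIXED:    VAR0 and VAR3 take the next two consecutive slots
                               after the built-ins (packed, no holes)
    INTEL_VUE_LAYOUT_SEPARATE: VAR0 -> first_generic_slot + 0,
                               VAR3 -> first_generic_slot + 3
                               (slots for the unwritten VAR1/VAR2 are left as holes)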

View file

@ -1215,7 +1215,7 @@ elk_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
void elk_compute_vue_map(const struct intel_device_info *devinfo,
struct intel_vue_map *vue_map,
uint64_t slots_valid,
bool separate_shader,
enum intel_vue_layout layout,
uint32_t pos_slots);
void elk_compute_tess_vue_map(struct intel_vue_map *const vue_map,

View file

@ -1424,7 +1424,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
struct intel_vue_map prev_stage_vue_map;
elk_compute_vue_map(devinfo, &prev_stage_vue_map,
key->input_slots_valid,
nir->info.separate_shader, 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, 1);
int first_slot =
elk_compute_first_urb_slot_required(inputs_read,

View file

@ -1276,7 +1276,9 @@ elk_compile_tes(const struct elk_compiler *compiler,
elk_compute_vue_map(devinfo, &prog_data->base.vue_map,
nir->info.outputs_written,
nir->info.separate_shader, 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, 1);
unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;

View file

@ -610,7 +610,9 @@ elk_compile_gs(const struct elk_compiler *compiler,
GLbitfield64 inputs_read = nir->info.inputs_read;
elk_compute_vue_map(compiler->devinfo,
&c.input_vue_map, inputs_read,
nir->info.separate_shader, 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, 1);
elk_nir_apply_key(nir, compiler, &key->base, 8);
elk_nir_lower_vue_inputs(nir, &c.input_vue_map);

View file

@ -373,7 +373,9 @@ elk_compile_tcs(const struct elk_compiler *compiler,
struct intel_vue_map input_vue_map;
elk_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
nir->info.separate_shader, 1);
nir->info.separate_shader ?
INTEL_VUE_LAYOUT_SEPARATE :
INTEL_VUE_LAYOUT_FIXED, 1);
elk_compute_tess_vue_map(&vue_prog_data->vue_map,
nir->info.outputs_written,
nir->info.patch_outputs_written);

View file

@ -60,17 +60,20 @@ void
elk_compute_vue_map(const struct intel_device_info *devinfo,
struct intel_vue_map *vue_map,
uint64_t slots_valid,
bool separate,
enum intel_vue_layout layout,
uint32_t pos_slots)
{
assert(layout == INTEL_VUE_LAYOUT_FIXED ||
layout == INTEL_VUE_LAYOUT_SEPARATE);
/* Keep using the packed/contiguous layout on old hardware - we only need
* the SSO layout when using geometry/tessellation shaders or 32 FS input
* varyings, which only exist on Gen >= 6. It's also a bit more efficient.
*/
if (devinfo->ver < 6)
separate = false;
layout = INTEL_VUE_LAYOUT_FIXED;
if (separate) {
if (layout == INTEL_VUE_LAYOUT_SEPARATE) {
/* In SSO mode, we don't know whether the adjacent stage will
* read/write gl_ClipDistance, which has a fixed slot location.
* We have to assume the worst and reserve a slot for it, or else
@ -84,7 +87,7 @@ elk_compute_vue_map(const struct intel_device_info *devinfo,
}
vue_map->slots_valid = slots_valid;
vue_map->separate = separate;
vue_map->layout = layout;
/* gl_Layer, gl_ViewportIndex & gl_PrimitiveShadingRateEXT don't get their
* own varying slots -- they are stored in the first VUE slot
@ -198,7 +201,7 @@ elk_compute_vue_map(const struct intel_device_info *devinfo,
uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
while (generics != 0) {
const int varying = ffsll(generics) - 1;
if (separate) {
if (layout == INTEL_VUE_LAYOUT_SEPARATE) {
slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
}
assign_vue_slot(vue_map, varying, slot++);
@ -224,7 +227,7 @@ elk_compute_tess_vue_map(struct intel_vue_map *vue_map,
vue_map->slots_valid = vertex_slots;
/* separate isn't really meaningful, but make sure it's initialized */
vue_map->separate = false;
vue_map->layout = INTEL_VUE_LAYOUT_FIXED;
vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
VARYING_BIT_TESS_LEVEL_INNER);
@ -301,12 +304,15 @@ void
elk_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
gl_shader_stage stage)
{
const char *layout_name =
vue_map->layout == INTEL_VUE_LAYOUT_FIXED ? "non-SSO" : "SSO";
if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) {
fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
vue_map->num_slots,
vue_map->num_per_patch_slots,
vue_map->num_per_vertex_slots,
vue_map->separate ? "SSO" : "non-SSO");
layout_name);
for (int i = 0; i < vue_map->num_slots; i++) {
if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) {
fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i,
@ -317,8 +323,7 @@ elk_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
}
}
} else {
fprintf(fp, "VUE map (%d slots, %s)\n",
vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
fprintf(fp, "VUE map (%d slots, %s)\n", vue_map->num_slots, layout_name);
for (int i = 0; i < vue_map->num_slots; i++) {
fprintf(fp, " [%d] %s\n", i,
varying_name(vue_map->slot_to_varying[i], stage));

View file

@ -122,6 +122,19 @@ enum intel_barycentric_mode {
(1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
(1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
enum intel_vue_layout {
/**
* Layout is fixed and shared by producer/consumer, allowing for tight
* packing.
*/
INTEL_VUE_LAYOUT_FIXED = 0,
/**
* Layout is separate; this works for ARB_separate_shader_objects but
* without Mesh support.
*/
INTEL_VUE_LAYOUT_SEPARATE,
};
/**
* Data structure recording the relationship between the gl_varying_slot enum
* and "slots" within the vertex URB entry (VUE). A "slot" is defined as a
@ -142,7 +155,7 @@ struct intel_vue_map {
uint64_t slots_valid;
/**
* Is this VUE map for a separate shader pipeline?
* The layout of the VUE
*
* Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
* without the linker having a chance to dead code eliminate unused varyings.
@ -150,7 +163,7 @@ struct intel_vue_map {
* This means that we have to use a fixed slot layout, based on the output's
* location field, rather than assigning slots in a compact contiguous block.
*/
bool separate;
enum intel_vue_layout layout;
/**
* Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are

View file

@ -316,20 +316,23 @@ anv_get_robust_flags(const struct vk_pipeline_robustness_state *rstate)
static void
populate_base_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
const enum intel_vue_layout vue_layout)
{
stage->key.base.robust_flags = anv_get_robust_flags(&stage->rstate);
stage->key.base.vue_layout = vue_layout;
stage->key.base.limit_trig_input_range =
device->physical->instance->limit_trig_input_range;
}
static void
populate_vs_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
const enum intel_vue_layout vue_layout)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
stage->key.vs.vf_component_packing =
device->physical->instance->vf_component_packing;
@ -338,31 +341,34 @@ populate_vs_prog_key(struct anv_pipeline_stage *stage,
static void
populate_tcs_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device,
unsigned input_vertices)
unsigned input_vertices,
const enum intel_vue_layout vue_layout)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
stage->key.tcs.input_vertices = input_vertices;
}
static void
populate_tes_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
const enum intel_vue_layout vue_layout)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
}
static void
populate_gs_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
const enum intel_vue_layout vue_layout)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
}
static bool
@ -424,18 +430,19 @@ populate_task_prog_key(struct anv_pipeline_stage *stage,
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, INTEL_VUE_LAYOUT_FIXED);
stage->key.base.uses_inline_push_addr = true;
}
static void
populate_mesh_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
const enum intel_vue_layout vue_layout)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
stage->key.base.uses_inline_push_addr = true;
}
@ -462,13 +469,14 @@ populate_wm_prog_key(struct anv_pipeline_stage *stage,
const struct vk_multisample_state *ms,
const struct vk_fragment_shading_rate_state *fsr,
const struct vk_render_pass_state *rp,
const enum intel_sometimes is_mesh)
const enum intel_sometimes is_mesh,
const enum intel_vue_layout vue_layout)
{
const struct anv_device *device = pipeline->base.device;
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, vue_layout);
struct brw_wm_prog_key *key = &stage->key.wm;
@ -553,7 +561,7 @@ populate_cs_prog_key(struct anv_pipeline_stage *stage,
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, INTEL_VUE_LAYOUT_FIXED);
stage->key.base.uses_inline_push_addr = device->info->verx10 >= 125;
}
@ -565,7 +573,7 @@ populate_bs_prog_key(struct anv_pipeline_stage *stage,
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device);
populate_base_prog_key(stage, device, INTEL_VUE_LAYOUT_FIXED);
stage->key.bs.pipeline_ray_flags = ray_flags;
@ -1159,7 +1167,7 @@ anv_pipeline_compile_vs(const struct brw_compiler *compiler,
brw_compute_vue_map(compiler->devinfo,
&vs_stage->prog_data.vs.base.vue_map,
vs_stage->nir->info.outputs_written,
vs_stage->nir->info.separate_shader,
vs_stage->key.base.vue_layout,
pos_slots);
vs_stage->num_stats = 1;
@ -1335,7 +1343,7 @@ anv_pipeline_compile_gs(const struct brw_compiler *compiler,
brw_compute_vue_map(compiler->devinfo,
&gs_stage->prog_data.gs.base.vue_map,
gs_stage->nir->info.outputs_written,
gs_stage->nir->info.separate_shader, 1);
gs_stage->key.base.vue_layout, 1);
gs_stage->num_stats = 1;
@ -1522,7 +1530,7 @@ anv_pipeline_compile_fs(const struct brw_compiler *compiler,
brw_compute_vue_map(compiler->devinfo,
&prev_vue_map,
fs_stage->nir->info.inputs_read,
fs_stage->nir->info.separate_shader,
fs_stage->key.base.vue_layout,
pos_slots);
fs_stage->key.wm.input_slots_valid = prev_vue_map.slots_valid;
@ -1742,6 +1750,16 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
const struct vk_graphics_pipeline_state *state,
struct anv_pipeline_stage *stages)
{
struct anv_device *device = pipeline->base.device;
enum intel_vue_layout vue_layout;
if ((pipeline->base.flags & VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT) ||
!device->vk.enabled_extensions.EXT_graphics_pipeline_library) {
vue_layout = INTEL_VUE_LAYOUT_FIXED;
} else {
vue_layout = INTEL_VUE_LAYOUT_SEPARATE;
}
for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
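The layout selection above reduces to the following (a plain restatement of the condition, not additional logic):

    EXT_graphics_pipeline_library   LINK_TIME_OPTIMIZATION flag   vue_layout
    disabled                        any                           INTEL_VUE_LAYOUT_FIXED
    enabled                         set                           INTEL_VUE_LAYOUT_FIXED
    enabled                         not set                       INTEL_VUE_LAYOUT_SEPARATE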
@ -1751,20 +1769,21 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
const struct anv_device *device = pipeline->base.device;
switch (stages[s].stage) {
case MESA_SHADER_VERTEX:
populate_vs_prog_key(&stages[s], device);
populate_vs_prog_key(&stages[s], device, vue_layout);
break;
case MESA_SHADER_TESS_CTRL:
populate_tcs_prog_key(&stages[s],
device,
BITSET_TEST(state->dynamic,
MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) ?
0 : state->ts->patch_control_points);
0 : state->ts->patch_control_points,
vue_layout);
break;
case MESA_SHADER_TESS_EVAL:
populate_tes_prog_key(&stages[s], device);
populate_tes_prog_key(&stages[s], device, vue_layout);
break;
case MESA_SHADER_GEOMETRY:
populate_gs_prog_key(&stages[s], device);
populate_gs_prog_key(&stages[s], device, vue_layout);
break;
case MESA_SHADER_FRAGMENT: {
/* Assume rasterization enabled in any of the following case :
@ -1794,7 +1813,8 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
pipeline,
state->dynamic,
raster_enabled ? state->ms : NULL,
state->fsr, state->rp, is_mesh);
state->fsr, state->rp, is_mesh,
vue_layout);
break;
}
@ -1803,7 +1823,7 @@ anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
break;
case MESA_SHADER_MESH: {
populate_mesh_prog_key(&stages[s], device);
populate_mesh_prog_key(&stages[s], device, vue_layout);
break;
}
@ -2019,8 +2039,11 @@ anv_pipeline_nir_preprocess(struct anv_pipeline *pipeline,
};
NIR_PASS(_, stage->nir, nir_opt_access, &opt_access_options);
/* Vulkan uses the separate-shader linking model */
stage->nir->info.separate_shader = true;
/* Use a separate-shader linking model for pipeline libraries; we do
* cross-stage linking otherwise.
*/
stage->nir->info.separate_shader =
stage->key.base.vue_layout != INTEL_VUE_LAYOUT_FIXED;
struct brw_nir_compiler_opts opts = {
.softfp64 = device->fp64_nir,